Treball de Final de Màster
ANÀLISI DEL DIA MUNDIAL DE LES MALALTIES MINORITÀRIES (Font de dades: TWITTER).
En aquest Jupyter Notebook, veurem com aplicar els diferents tipus d'enllaços possibles per l'algorisme jeràrquic aglomeratiu.
S'apliquen sobre la mateixa matriu de vectorització millorada ja considerada amb l'algorisme KMeans, que resumim tot seguit:
Les mesures d'optimització aplicades són:
Passem de considerar 98433 tuits a considerar 12408 sense perdre informació de contingut.
Passem d'un vocabulari de 19603 paraules a 213, amb la possible pèrdua de continguts o temàtiques minoritaris interessants.
Considerem les temàtiques a clústers definides pels mots més significatius; per tant, també a part es podrien concloure temàtiques globals respecte
el conjunt total de clústers.
# Descripció de les llibreries Python utilitzades:
import numpy as np
# NumPy és un paquet de processament de matrius de propòsit general.
# Proporciona un objecte de matriu multidimensional d’alt rendiment i eines per treballar amb aquestes matrius.
# És el paquet fonamental per a la computació científica amb Python.
import pandas as pd
# Pandas és l’eina per treballar amb dades tabulars: dades emmagatzemades en fulls de càlcul o bases de dades.
# Permet explorar, netejar i processar dades tabulars usant l'objecte DataFrame.
# Operacions: seleccionar, filtrar per files o columnes o per una condició, i exportar les dades o visualitzar-les.
from sklearn.feature_extraction.text import TfidfVectorizer
# Sklearn és un paquet/llibreria indicat per aplicar Aprenentatge Automàtic.
# Per l'extracció de característiques s'utilitza l'objecte
# TfidfVectorizer per representar numèricament mitjançant vectors dades de text.
from sklearn.cluster import KMeans, DBSCAN
# Per executar les implementacions dels algorismes de KMeans i DBSCAN
# utilitzem el mòdul 'cluster' de la llibreria Sklearn.
from sklearn.neighbors import NearestNeighbors
# Implementació de l'algorisme K-Nearest Neighbors
# per l'optimització del paràmetre eps en l'execució
# de l'algorisme DBSCAN.
from sklearn.metrics.pairwise import cosine_similarity
# Implementació de la mètrica de similitud de vectors.
# S'ha usat per comparar aquesta mètrica de vectors
# amb la mètrica de distància euclidiana.
from sklearn.cluster import AgglomerativeClustering
# Implementació de l'algorisme jeràrquic de tipus Bottom-Up
# o aglomeratiu. Finalment s'han comparat els resultats amb
# els resultats calculats amb la llibreria SCIPY.
import matplotlib.pyplot as plt
# matplotlib.pyplot permet la visualització gràfica, controlant tots
# els aspectes d'una gràfica des de la plantilla, títols, eixos,
# representació de les dades, reixeta, etiquetes, etc.
from matplotlib.ticker import FormatStrFormatter
import matplotlib.ticker as ticker
# Llibreries usades com a complement de la llibreria matplotlib.pyplot.
# Tractament específic dels eixos en la definició i
# visualització de l'histograma resultat d'un agrupament.
import time
# Utilitzat pel càlcul de durada dels processos.
import delayedsparse
# Implementació eficient de matrius disperses per a diverses
# anàlisis de components principals PCA. En concret ho apliquem
# per aplicar PCA a una matriu dispersa resultat de vectoritzar
# amb l'objecte TfidfVectorizer un volum de dades gran.
import scipy.sparse
# Llibreria per la manipulació de matrius disperses
# SciPy 2-D per a dades numèriques.
import re, collections
# Llibreries
# 're': usada per la definició, manipulació i tractament de
# text mitjançant 'expressions regulars'.
# 'collections': usada pel recompte de paraules en textos
# i per la implementació del 'Bag of Words' (BoW).
from textblob import TextBlob, Word
# Llibreria de suport de l'anàlisi de sentiment
# en paraules i frases, en concret s'ha aplicat al
# càlcul de la polaritat i subjectivitat en textos.
from nltk.tokenize import TweetTokenizer
from nltk.stem import LancasterStemmer, PorterStemmer
from nltk.corpus import stopwords
from nltk.probability import FreqDist
# NLTK: Llibreria formada per un conjunt de mòduls pel
# tractament i manipulació de textos i tractament del llenguatge
# natural en general. En concret s'han usat en la fase de preprocessat.
# TweetTokenizer: Usat per obtenir els elements representatius o tokens;
# en concret aquest mòdul és específic per textos de Twitter.
# LancasterStemmer, PorterStemmer: Mòduls valorats per l'operació de
# stemming, on s'obté el mot arrel i se'n descarten els derivats.
# Stopwords: Eliminació de les paraules freqüents poc significatives
# sovint usades en el llenguatge per la unió de frases o de complement
# a substantius i verbs.
# FreqDist: S'ha usat pel càlcul ràpid del recompte
# de paraules o 'Bag of Words' (BoW).
from scipy.cluster.hierarchy import dendrogram, linkage, single, complete, ward
import scipy.cluster.hierarchy as hc
import scipy.spatial.distance as metrica
# Conjunt de llibreries pel càlcul dels diferents
# tipus d'enllaç en l'aplicació de l'algorisme jeràrquic aglomeratiu
# i les representacions mitjançant un dendrograma associades.
import fastcluster
# Alternativa a sklearn pel càlcul dels algorismes
# d'agrupament i la seva representació.
from itertools import cycle, islice
# Llibreries optimitzades per la implementació
# d'iteracions eficients.
# Load the dataset from the Excel workbook and report how long the read took.
time_start = time.time()
tuits = pd.read_excel("c:/users/qdeda/TFM_Code/DMMM_dataset_Final.xlsx")
temps = (time.time() - time_start) / 60  # elapsed time, in minutes

# Split the elapsed minutes into whole minutes plus the remaining whole seconds.
minuts = int(temps) if temps > 0 else 0
segons = int((temps - int(temps)) * 60)
missatge = "CÃ rrega del Dataset: {} observacions. \nDurada: {} minuts {} segons."
print(missatge.format(tuits.shape[0], minuts, segons))
# Transformations.
# Drop the retweets: rows whose raw text (text_y) starts with 'RT'.
es_retuit = tuits.text_y.str.startswith('RT')
t = tuits.drop(tuits[es_retuit].index)

# Merge all the tweets of the same author: one row per user_idstr, with the
# text columns concatenated using a single space.
textos_per_autor = t[['text_net', 'text_Norm', 'user_idstr']].groupby(['user_idstr'])
t1 = textos_per_autor.agg(' '.join)

# Mean polarity and mean subjectivity over all the tweets of each author.
sentiment_per_autor = t[['user_idstr', 'polarity', 'subjectivity']].groupby(['user_idstr'])
pol_subj = sentiment_per_autor.agg('mean')
# Remove repeated words inside each author's merged text.
#
# The de-duplicated text is assigned back to the columns explicitly: writing
# into the Series yielded by DataFrame.iterrows() is not guaranteed to modify
# the frame (pandas documents that the yielded row may be a copy).
# dict.fromkeys() keeps the first occurrence of every word, so the word order
# is deterministic (set() order can change from run to run); this relies on
# insertion-ordered dicts (Python 3.7+).
for columna in ('text_net', 'text_Norm'):
    t1[columna] = t1[columna].map(
        lambda text: ' '.join(dict.fromkeys(text.split()))
    )
# Rebuild the dataset with one row per author. Tweet identities are lost: the
# TF-IDF matrix built from this frame is authors x words.
tuits = pd.merge(t1, pol_subj, left_on=t1.index, right_on=pol_subj.index, how='inner')
tuits.columns = ['autor', 'text', 'text_norm', 'polaritat', 'subjectivitat']

# The obvious hashtag words are still present and must be removed.
excloure = ['rare', 'day', 'today', 'february', 'disease', 'diseases', 'world']
# The removal is a plain substring replacement, so the longest words go first:
# in list order 'day' would turn "today" into "to" and 'disease' would turn
# "diseases" into "s" before the longer entries were ever tried.
for p in sorted(excloure, key=len, reverse=True):
    # regex=False: the entries are literal words, and this keeps the result
    # independent of the pandas version's default for `regex`.
    tuits.text = tuits.text.str.replace(p, "", regex=False)
tuits.text

# Drop the authors whose text is empty after the removals. Whitespace-only
# texts count as empty too (removing every word leaves the separators behind).
sense_text = tuits.text.str.strip().str.len() == 0
tuits = tuits.drop(tuits[sense_text].index)
tuits.reset_index(drop=True, inplace=True)

# Release the intermediate objects.
t = []
t1 = []
t2 = []
tuits.head()
# First pass: TF-IDF over the per-author texts with the default settings.
# Its matrix is overwritten below; only the shape expression is evaluated
# (notebook cell output).
tfidf_vect = TfidfVectorizer()
matriu_tfidf = tfidf_vect.fit_transform(tuits.text)
matriu_tfidf.shape
# Second pass: refit with min_df=0.01. tfidf_vect / matriu_tfidf from this
# pass are the ones the rest of the script uses.
tfidf_vect = TfidfVectorizer(min_df=0.01)
matriu_tfidf = tfidf_vect.fit_transform(tuits.text)
matriu_tfidf.shape
# Vocabulary as (word, column index) pairs, ordered by column index.
sorted(tfidf_vect.vocabulary_.items(), key=lambda x:x[1])
# Keep only the authors whose mean subjectivity is above 0.5.
es_subjectiu = tuits.subjectivitat > 0.5
tuits_subj = tuits[es_subjectiu]
tuits_subj.shape

# Subjective authors with positive polarity (>= 0.5).
es_positiu = tuits_subj.polaritat >= 0.5
tuits_pos = tuits_subj[es_positiu]
print(tuits_pos.shape)

# Subjective authors with negative polarity (< -0.5).
es_negatiu = tuits_subj.polaritat < -0.5
tuits_neg = tuits_subj[es_negatiu]
tuits_neg.shape

# Separate TF-IDF models (default settings) for the positive and the negative
# subsets, each fitted on its own texts.
tfidf_vect_pos = TfidfVectorizer()
matriu_tfidf_pos = tfidf_vect_pos.fit_transform(tuits_pos.text)
tfidf_vect_neg = TfidfVectorizer()
matriu_tfidf_neg = tfidf_vect_neg.fit_transform(tuits_neg.text)
for matriu in (matriu_tfidf_pos, matriu_tfidf_neg):
    print(matriu.shape)
# Pairwise comparison of the rows of the TF-IDF matrix.
# NOTE: despite the variable name, the stored value is 1 - cosine similarity,
# i.e. 0 for rows with cosine similarity 1. The result is a dense square
# matrix with one row and one column per row of matriu_tfidf.
# NOTE(review): no later use of `similaritat` is visible in this part of the
# file - confirm it is still needed.
similaritat = 1 - cosine_similarity(matriu_tfidf)
similaritat.shape
def tokens_mes_propers(vector_clusters, vectorizer, mat_vect, topk=10):
    """Print the `topk` heaviest terms of every cluster.

    For each distinct label in `vector_clusters` (in ascending order), the
    rows of `mat_vect` that carry that label are summed column-wise and the
    columns with the largest sums are printed, heaviest first, on one line:
    ``Cluster <label>: <term> <term> ...``.

    Parameters
    ----------
    vector_clusters : array-like of int
        Cluster label of every row of `mat_vect`.
    vectorizer : fitted vectorizer
        Must expose ``get_feature_names_out()`` or the older
        ``get_feature_names()``; entry ``i`` names column ``i`` of `mat_vect`.
    mat_vect : scipy sparse matrix or numpy array
        Row-per-document term weights.
    topk : int, default 10
        Number of terms printed per cluster.

    Returns
    -------
    None
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back for older releases.
    if hasattr(vectorizer, 'get_feature_names_out'):
        paraules = vectorizer.get_feature_names_out()
    else:
        paraules = vectorizer.get_feature_names()

    # Accept plain lists too: the equality mask below needs an array.
    vector_clusters = np.asarray(vector_clusters)

    # np.unique gives a deterministic, ascending label order.
    for this_label in np.unique(vector_clusters):
        matching_rows = np.where(vector_clusters == this_label)[0]
        # asarray + ravel flattens the column sums whether they come back as
        # np.matrix (sparse input) or ndarray (dense input).
        coeff_sums = np.asarray(mat_vect[matching_rows].sum(axis=0)).ravel()
        sorted_coeff_idxs = np.argsort(coeff_sums)[::-1]
        termes = ''.join('{} '.format(paraules[idx])
                         for idx in sorted_coeff_idxs[:topk])
        print('Cluster {}: {}'.format(this_label, termes))
# Dimensionality reduction with PCA.
# Xz is just another name for the TF-IDF matrix; it is reduced to n_comp
# components and the result is kept in X_PCA, which the clustering and the
# scatter plots below read by column (0 and 1).
# NOTE(review): delayedsparse.PCA is assumed to follow the scikit-learn
# fit/transform convention and to return one row per row of Xz - confirm.
Xz = matriu_tfidf
n_comp=2
print("\nCreant el model i matriu PCA - Nº Components=",n_comp)
pca=delayedsparse.PCA(n_components=n_comp)
X_PCA = pca.fit(Xz).transform(Xz)
print("Dimensions de les dades reduïdes amb PCA:", np.shape(X_PCA))
# Full cluster hierarchy using Ward linkage (variance minimisation),
# drawn as a truncated dendrogram.
lnk = "Ward"
plt.figure(figsize=(12, 8))
time_start = time.time()
Z1 = hc.ward(X_PCA)
opcions_dendrograma = dict(
    truncate_mode='lastp',   # show only the last p merged clusters
    p=200,                   # value of p
    leaf_rotation=90.,       # rotation of the x-axis labels
    leaf_font_size=None,     # font size of the x-axis labels
    no_labels=True,
    show_contracted=True,    # set to True because truncate_mode is used
)
dn = dendrogram(Z1, **opcions_dendrograma)
# The timer covers the linkage computation and the dendrogram call only.
temps = (time.time() - time_start) / 60
plt.title("Dendrograma Enllaç tipus {}".format(lnk))
plt.savefig("dn_ward.jpg", format='jpg', bbox_inches='tight')
plt.show()
minuts = int(temps) if temps > 0 else 0
segons = int((temps - int(temps)) * 60)
print("#Visualització dendrograma (PCA):", np.shape(X_PCA), "\n Durada: ", minuts, "minut/s ",
      segons, "segons.")
# Cut the Ward hierarchy (Z1) at several cluster counts. For every cut:
#   1. draw the points on the two PCA components, coloured by cluster,
#   2. save the figure as je-ward_<n>.jpg and show it,
#   3. print the 15 heaviest TF-IDF terms of every cluster.
# The six cuts used to be six copy-pasted cells; they differ only in
# num_clusters, so they are produced by one loop.
# NOTE(review): the title literal contains what looks like a mis-encoded
# "Jeràrquics"; it is left untouched here - confirm the file encoding.
lnk = 'ward'
paleta = ['dodgerblue', 'orange', 'forestgreen',
          'lime', 'brown', 'mediumorchid',
          'darkgrey', 'teal', 'y',
          'blue', 'red', 'hotpink', 'black',
          'magenta', 'aquamarine']
for num_clusters in [3, 6, 8, 10, 13, 15]:
    # fcluster numbers the clusters from 1; shift the labels to start at 0.
    tall = hc.fcluster(Z1, num_clusters, criterion='maxclust') - 1
    # One colour per requested cluster. 'maxclust' may return fewer clusters
    # than requested, so the table is sized by num_clusters rather than by the
    # number of labels observed: every label stays a valid index.
    colors = np.array(list(islice(cycle(paleta), num_clusters)))
    fig, ax = plt.subplots(figsize=(12, 8))
    ax.set_title("Alg. Jerà rquics: {} - Enllaç: {} - Nº de clústers={}".format("Aglomeratiu", lnk, num_clusters))
    # Only the clusters that exist are drawn, so the legend has no empty entry.
    # norm/cmap are not passed: scatter ignores them when c is a single colour.
    for idcluster in np.unique(tall):
        membres = tall == idcluster
        ax.scatter(np.array(X_PCA[:, 0][membres]),
                   np.array(X_PCA[:, 1][membres]), s=15,
                   c=colors[idcluster], label="Cluster " + str(idcluster))
    ax.legend(handletextpad=0, borderpad=0, markerscale=2)
    plt.savefig("je-ward_{}.jpg".format(num_clusters), format='jpg', bbox_inches='tight')
    plt.show()
    tokens_mes_propers(tall, tfidf_vect, matriu_tfidf, 15)
# 2 x 3 grid with the scatter plot of every Ward cut, saved as last_ward.jpg.
# NOTE(review): the title literal contains what looks like a mis-encoded
# "Jeràrquics"; it is left untouched here - confirm the file encoding.
files = 2
cols = 3
lnk = 'ward'
ample, alt = 20, 12  # figure width and height, in inches
llista_clusters = [3, 6, 8, 10, 13, 15]
paleta = ['dodgerblue', 'orange', 'forestgreen',
          'lime', 'brown', 'mediumorchid',
          'darkgrey', 'teal', 'y',
          'blue', 'red', 'hotpink', 'black',
          'magenta', 'aquamarine']
fig, ax = plt.subplots(files, cols, figsize=(ample, alt))
# ax.flat walks the grid row by row, pairing each panel with its cluster count.
for panell, num_clusters in zip(ax.flat, llista_clusters):
    # fcluster numbers the clusters from 1; shift the labels to start at 0.
    tall = hc.fcluster(Z1, num_clusters, criterion='maxclust') - 1
    # Sized by num_clusters so every label is a valid index even when
    # 'maxclust' returns fewer clusters than requested.
    colors = np.array(list(islice(cycle(paleta), num_clusters)))
    # norm/cmap are not passed: scatter ignores them when c is a single colour.
    for idcluster in np.unique(tall):
        membres = tall == idcluster
        panell.scatter(np.array(X_PCA[:, 0][membres]),
                       np.array(X_PCA[:, 1][membres]), s=5,
                       c=colors[idcluster], label="Cluster " + str(idcluster))
    panell.set_title("Alg. Jerà rquics: {}\nEnllaç: {} #clústers={}".format("Aglomeratiu", lnk, num_clusters))
    panell.legend(handletextpad=0, borderpad=0, markerscale=3)
plt.tight_layout()
plt.savefig("last_ward.jpg", format='jpg', bbox_inches='tight')
# 2 x 3 grid with the cluster-size histogram of every Ward cut, saved as
# hist_ward.jpg. Under each bar: the raw count and, in red, its percentage.
# NOTE(review): the title literal contains what looks like a mis-encoded
# "Jeràrquics"; it is left untouched here - confirm the file encoding.
files = 2
cols = 3
lnk = 'ward'
alt = 10     # figure height, in inches
ample = 22   # figure width, in inches
llista_clusters = [3, 6, 8, 10, 13, 15]
fig, ax = plt.subplots(files, cols, figsize=(ample, alt))
# ax.flat walks the grid row by row, pairing each panel with its cluster count.
for panell, num_clusters in zip(ax.flat, llista_clusters):
    # fcluster numbers the clusters from 1; shift the labels to start at 0.
    tall = hc.fcluster(Z1, num_clusters, criterion='maxclust') - 1
    # Unit-wide bins [k, k+1): one bar per cluster id.
    counts, bins, patches = panell.hist(tall, bins=np.arange(num_clusters + 1),
                                        facecolor='skyblue', edgecolor='dodgerblue')
    bin_centers = 0.5 * np.diff(bins) + bins[:-1]
    panell.set_xticks(bin_centers)
    # One label per tick: the left edge of each bin, i.e. the cluster id.
    # Passing all the edges (one more than the ticks) raises ValueError on
    # recent Matplotlib versions. No major formatter is set because
    # set_xticklabels installs fixed labels anyway.
    panell.set_xticklabels(bins[:-1], rotation=0, color='b')
    panell.set_title("Alg. Jerà rquics: {}\nEnllaç: {} #clústers={}".format("Aglomeratiu", lnk, num_clusters))
    total = counts.sum()
    for count, x in zip(counts.astype(int), bin_centers):
        # Label the raw counts
        panell.annotate(str(count), xy=(x, 0), xycoords=('data', 'axes fraction'),
                        xytext=(0, -20), textcoords='offset points', va='top', ha='center')
        # Label the percentages
        percent = '%0.0f%%' % (100 * float(count) / total)
        panell.annotate(percent, xy=(x, 0), xycoords=('data', 'axes fraction'),
                        xytext=(0, -35), textcoords='offset points', va='top', ha='center', c='r')
plt.tight_layout()
plt.savefig("hist_ward.jpg", format='jpg', bbox_inches='tight')
# Full cluster hierarchy using single linkage, drawn as a truncated dendrogram.
lnk = "Simple"
plt.figure(figsize=(12, 8))
time_start = time.time()
Z2 = hc.single(X_PCA)
dn = dendrogram(Z2,
                truncate_mode='lastp',   # show only the last p merged clusters
                p=200,                   # value of p
                leaf_rotation=90.,       # rotation of the x-axis labels
                leaf_font_size=None,     # font size of the x-axis labels
                no_labels=True,
                show_contracted=True     # set to True because truncate_mode is used
                )
# Stop the timer before the figure is saved and shown, as the Ward cell does:
# plt.show() can block until the window is closed, and that wait is not part
# of the linkage/dendrogram cost being reported.
temps = (time.time() - time_start) / 60
plt.title("Dendrograma Enllaç tipus {}".format(lnk))
plt.savefig("dn_single.jpg", format='jpg', bbox_inches='tight')
plt.show()
print("#Visualització dendrograma (PCA):", np.shape(X_PCA), "\n Durada: ", int(temps) if temps > 0 else 0, "minut/s ",
      int((temps - int(temps)) * 60), "segons.")
# Cut the single-linkage hierarchy (Z2) at several cluster counts. For every cut:
#   1. draw the points on the two PCA components, coloured by cluster,
#   2. save the figure as je-single_<n>.jpg and show it,
#   3. print the 15 heaviest TF-IDF terms of every cluster.
# The six cuts used to be six copy-pasted cells; they differ only in
# num_clusters, so they are produced by one loop.
# NOTE(review): the title literal contains what looks like a mis-encoded
# "Jeràrquics"; it is left untouched here - confirm the file encoding.
lnk = 'single'
paleta = ['dodgerblue', 'orange', 'forestgreen',
          'lime', 'brown', 'mediumorchid',
          'darkgrey', 'teal', 'y',
          'blue', 'red', 'hotpink', 'black',
          'magenta', 'aquamarine']
for num_clusters in [3, 6, 8, 10, 13, 15]:
    # fcluster numbers the clusters from 1; shift the labels to start at 0.
    tall = hc.fcluster(Z2, num_clusters, criterion='maxclust') - 1
    # One colour per requested cluster. 'maxclust' may return fewer clusters
    # than requested, so the table is sized by num_clusters rather than by the
    # number of labels observed: every label stays a valid index.
    colors = np.array(list(islice(cycle(paleta), num_clusters)))
    fig, ax = plt.subplots(figsize=(12, 8))
    ax.set_title("Alg. Jerà rquics: {} - Enllaç: {} - Nº de clústers={}".format("Aglomeratiu", lnk, num_clusters))
    # Only the clusters that exist are drawn, so the legend has no empty entry.
    # norm/cmap are not passed: scatter ignores them when c is a single colour.
    for idcluster in np.unique(tall):
        membres = tall == idcluster
        ax.scatter(np.array(X_PCA[:, 0][membres]),
                   np.array(X_PCA[:, 1][membres]), s=15,
                   c=colors[idcluster], label="Cluster " + str(idcluster))
    ax.legend(handletextpad=0, borderpad=0, markerscale=2)
    plt.savefig("je-single_{}.jpg".format(num_clusters), format='jpg', bbox_inches='tight')
    plt.show()
    tokens_mes_propers(tall, tfidf_vect, matriu_tfidf, 15)
# 2 x 3 grid with the scatter plot of every single-linkage cut, saved as
# last_single.jpg.
# NOTE(review): the title literal contains what looks like a mis-encoded
# "Jeràrquics"; it is left untouched here - confirm the file encoding.
files = 2
cols = 3
lnk = 'single'
ample, alt = 20, 12  # figure width and height, in inches
llista_clusters = [3, 6, 8, 10, 13, 15]
paleta = ['dodgerblue', 'orange', 'forestgreen',
          'lime', 'brown', 'mediumorchid',
          'darkgrey', 'teal', 'y',
          'blue', 'red', 'hotpink', 'black',
          'magenta', 'aquamarine']
fig, ax = plt.subplots(files, cols, figsize=(ample, alt))
# ax.flat walks the grid row by row, pairing each panel with its cluster count.
for panell, num_clusters in zip(ax.flat, llista_clusters):
    # Labels come from fcluster (shifted to start at 0), like in the per-cut
    # single-linkage figures and term lists: cut_tree numbers the clusters
    # differently, so "Cluster k" would not name the same group here.
    tall = hc.fcluster(Z2, num_clusters, criterion='maxclust') - 1
    # Sized by num_clusters so every label is a valid index even when
    # 'maxclust' returns fewer clusters than requested.
    colors = np.array(list(islice(cycle(paleta), num_clusters)))
    # norm/cmap are not passed: scatter ignores them when c is a single colour.
    for idcluster in np.unique(tall):
        membres = tall == idcluster
        panell.scatter(np.array(X_PCA[:, 0][membres]),
                       np.array(X_PCA[:, 1][membres]), s=5,
                       c=colors[idcluster], label="Cluster " + str(idcluster))
    panell.set_title("Alg. Jerà rquics: {}\nEnllaç: {} #clústers={}".format("Aglomeratiu", lnk, num_clusters))
    panell.legend(handletextpad=0, borderpad=0, markerscale=3)
plt.tight_layout()
plt.savefig("last_single.jpg", format='jpg', bbox_inches='tight')
# Histograms: 2 x 3 grid with the cluster sizes of every single-linkage cut,
# saved as hist_single.jpg. Under each bar: the raw count and, in red, its
# percentage.
# NOTE(review): the title literal contains what looks like a mis-encoded
# "Jeràrquics"; it is left untouched here - confirm the file encoding.
files = 2
cols = 3
lnk = 'single'
alt = 10     # figure height, in inches
ample = 22   # figure width, in inches
llista_clusters = [3, 6, 8, 10, 13, 15]
fig, ax = plt.subplots(files, cols, figsize=(ample, alt))
# ax.flat walks the grid row by row, pairing each panel with its cluster count.
for panell, num_clusters in zip(ax.flat, llista_clusters):
    # Labels come from fcluster (shifted to start at 0), like in the per-cut
    # single-linkage figures and term lists: cut_tree numbers the clusters
    # differently, so bar k would not describe the same "Cluster k".
    tall = hc.fcluster(Z2, num_clusters, criterion='maxclust') - 1
    # Unit-wide bins [k, k+1): one bar per cluster id.
    counts, bins, patches = panell.hist(tall, bins=np.arange(num_clusters + 1),
                                        facecolor='skyblue', edgecolor='dodgerblue')
    bin_centers = 0.5 * np.diff(bins) + bins[:-1]
    panell.set_xticks(bin_centers)
    # One label per tick: the left edge of each bin, i.e. the cluster id.
    # Passing all the edges (one more than the ticks) raises ValueError on
    # recent Matplotlib versions. No major formatter is set because
    # set_xticklabels installs fixed labels anyway.
    panell.set_xticklabels(bins[:-1], rotation=0, color='b')
    panell.set_title("Alg. Jerà rquics: {}\nEnllaç: {} #clústers={}".format("Aglomeratiu", lnk, num_clusters))
    total = counts.sum()
    for count, x in zip(counts.astype(int), bin_centers):
        # Label the raw counts
        panell.annotate(str(count), xy=(x, 0), xycoords=('data', 'axes fraction'),
                        xytext=(0, -20), textcoords='offset points', va='top', ha='center')
        # Label the percentages
        percent = '%0.0f%%' % (100 * float(count) / total)
        panell.annotate(percent, xy=(x, 0), xycoords=('data', 'axes fraction'),
                        xytext=(0, -35), textcoords='offset points', va='top', ha='center', c='r')
plt.tight_layout()
plt.savefig("hist_single.jpg", format='jpg', bbox_inches='tight')
# Full cluster hierarchy using complete linkage, drawn as a truncated dendrogram.
lnk = "Complet"
plt.figure(figsize=(12, 8))
time_start = time.time()
Z3 = hc.complete(X_PCA)
dn = dendrogram(Z3,
                truncate_mode='lastp',   # show only the last p merged clusters
                p=200,                   # value of p
                leaf_rotation=90.,       # rotation of the x-axis labels
                leaf_font_size=None,     # font size of the x-axis labels
                no_labels=True,
                show_contracted=True     # set to True because truncate_mode is used
                )
# Stop the timer before the figure is saved and shown, as the Ward cell does:
# plt.show() can block until the window is closed, and that wait is not part
# of the linkage/dendrogram cost being reported.
temps = (time.time() - time_start) / 60
plt.title("Dendrograma Enllaç tipus {}".format(lnk))
plt.savefig("dn_complete.jpg", format='jpg', bbox_inches='tight')
plt.show()
print("#Visualització dendrograma (PCA):", np.shape(X_PCA), "\n Durada: ", int(temps) if temps > 0 else 0, "minut/s ",
      int((temps - int(temps)) * 60), "segons.")
# Cut the complete-linkage hierarchy (Z3) at several cluster counts. For every cut:
#   1. draw the points on the two PCA components, coloured by cluster,
#   2. save the figure as je-complete_<n>.jpg and show it,
#   3. print the 15 heaviest TF-IDF terms of every cluster.
# The six cuts used to be six copy-pasted cells; they differ only in
# num_clusters, so they are produced by one loop.
# NOTE(review): the title literal contains what looks like a mis-encoded
# "Jeràrquics"; it is left untouched here - confirm the file encoding.
lnk = 'complete'
paleta = ['dodgerblue', 'orange', 'forestgreen',
          'lime', 'brown', 'mediumorchid',
          'darkgrey', 'teal', 'y',
          'blue', 'red', 'hotpink', 'black',
          'magenta', 'aquamarine']
for num_clusters in [3, 6, 8, 10, 13, 15]:
    # fcluster numbers the clusters from 1; shift the labels to start at 0.
    tall = hc.fcluster(Z3, num_clusters, criterion='maxclust') - 1
    # One colour per requested cluster. 'maxclust' may return fewer clusters
    # than requested, so the table is sized by num_clusters rather than by the
    # number of labels observed: every label stays a valid index.
    colors = np.array(list(islice(cycle(paleta), num_clusters)))
    fig, ax = plt.subplots(figsize=(12, 8))
    ax.set_title("Alg. Jerà rquics: {} - Enllaç: {} - Nº de clústers={}".format("Aglomeratiu", lnk, num_clusters))
    # Only the clusters that exist are drawn, so the legend has no empty entry.
    # norm/cmap are not passed: scatter ignores them when c is a single colour.
    for idcluster in np.unique(tall):
        membres = tall == idcluster
        ax.scatter(np.array(X_PCA[:, 0][membres]),
                   np.array(X_PCA[:, 1][membres]), s=15,
                   c=colors[idcluster], label="Cluster " + str(idcluster))
    ax.legend(handletextpad=0, borderpad=0, markerscale=2)
    plt.savefig("je-{}_{}.jpg".format(lnk, num_clusters), format='jpg', bbox_inches='tight')
    plt.show()
    tokens_mes_propers(tall, tfidf_vect, matriu_tfidf, 15)
# 2x3 overview figure: one scatter panel per entry of llista_clusters, each
# showing the cut of the Z3 linkage matrix (computed earlier in the file)
# on the first two columns of X_PCA. Saved as "last_complete.jpg".
files = 2
cols = 3
lnk = 'complete'
# figsize is (width, height): "ample" is the width and "alt" the height.
ample = 20
alt = 12
llista_clusters = [3, 6, 8, 10, 13, 15]
palette = ['dodgerblue', 'orange', 'forestgreen',
           'lime', 'brown', 'mediumorchid',
           'darkgrey', 'teal', 'y',
           'blue', 'red', 'hotpink', 'black',
           'magenta', 'aquamarine']
fig, ax = plt.subplots(files, cols, figsize=(ample, alt))
# ax.flat walks the panels row by row, matching the order of llista_clusters.
for panel, num_clusters in zip(ax.flat, llista_clusters):
    tall = hc.cut_tree(Z3, n_clusters=[num_clusters]).ravel()
    for idcluster in range(num_clusters):
        mask = tall == idcluster
        # A named colour is used directly; cmap/norm would be ignored here.
        panel.scatter(np.array(X_PCA[:, 0][mask]),
                      np.array(X_PCA[:, 1][mask]), s=5,
                      c=palette[idcluster % len(palette)],
                      label="Cluster " + str(idcluster))
    panel.set_title("Alg. Jeràrquics: {}\nEnllaç: {} #clústers={}".format(
        "Aglomeratiu", lnk, num_clusters))
    panel.legend(handletextpad=0, borderpad=0, markerscale=3)
plt.tight_layout()
plt.savefig("last_complete.jpg", format='jpg', bbox_inches='tight')
# 2x3 figure of cluster-size histograms: one panel per entry of
# llista_clusters, each bar annotated with its raw count (black) and its
# share of all observations (red). Saved as "hist_complet.jpg".
files = 2
cols = 3
lnk = 'Complet'
alt = 10
ample = 22
llista_clusters = [3, 6, 8, 10, 13, 15]
fig, ax = plt.subplots(files, cols, figsize=(ample, alt))
# ax.flat walks the panels row by row, matching the order of llista_clusters.
for panel, num_clusters in zip(ax.flat, llista_clusters):
    tall = hc.cut_tree(Z3, n_clusters=[num_clusters]).ravel()
    # Unit-width bins [0,1), [1,2), ... so each bar is one cluster label.
    counts, bins, _ = panel.hist(tall, bins=np.arange(num_clusters + 1),
                                 facecolor='skyblue', edgecolor='dodgerblue')
    bin_centers = 0.5 * np.diff(bins) + bins[:-1]
    panel.set_xticks(bin_centers)
    # One label per tick (the left edge of each bin). Passing all
    # num_clusters + 1 edges, as before, mismatches the number of ticks and
    # is rejected with ValueError by current Matplotlib releases.
    panel.set_xticklabels(bins[:-1], rotation=0, color='b')
    panel.set_title("Alg. Jeràrquics: {}\nEnllaç: {} #clústers={}".format(
        "Aglomeratiu", lnk, num_clusters))
    total = counts.sum()
    for count, x in zip(counts.astype(int), bin_centers):
        # Raw count, just below the axis.
        panel.annotate(str(count), xy=(x, 0),
                       xycoords=('data', 'axes fraction'),
                       xytext=(0, -20), textcoords='offset points',
                       va='top', ha='center')
        # Percentage of all observations, one line further down.
        percent = '%0.0f%%' % (100 * float(count) / total)
        panel.annotate(percent, xy=(x, 0),
                       xycoords=('data', 'axes fraction'),
                       xytext=(0, -35), textcoords='offset points',
                       va='top', ha='center', color='r')
plt.tight_layout()
plt.savefig("hist_complet.jpg", format='jpg', bbox_inches='tight')
# Build the average-linkage hierarchy (Z4) over X_PCA and draw its
# dendrogram truncated to the last 200 merged clusters. The figure is saved
# as "dn_average.jpg" and the elapsed time is printed in minutes/seconds.
lnk = "Mitjana (Average)"
plt.figure(figsize=(12, 8))
time_start = time.time()
Z4 = hc.average(X_PCA)
opcions_dendrograma = dict(
    truncate_mode='lastp',   # show only the last p merged clusters
    p=200,                   # value of p
    leaf_rotation=90.,       # rotation of the x-axis labels
    leaf_font_size=None,     # font size of the x-axis labels
    no_labels=True,
    show_contracted=True,    # only meaningful together with truncate_mode
)
dn = dendrogram(Z4, **opcions_dendrograma)
plt.title("Dendrograma Enllaç tipus {}".format(lnk))
plt.savefig("dn_average.jpg", format='jpg', bbox_inches='tight')
plt.show()
# Elapsed time in minutes, split into whole minutes and remaining seconds.
temps = (time.time() - time_start) / 60
minuts = int(temps) if temps > 0 else 0
segons = int((temps - int(temps)) * 60)
print("#Visualització dendrograma (PCA):", np.shape(X_PCA),
      "\n Durada: ", minuts, "minut/s ", segons, "segons.")
# For each requested number of clusters, cut the average-linkage matrix Z4,
# draw the clusters on the first two columns of X_PCA, save the figure as
# "je-<lnk>_<k>.jpg" and then call tokens_mes_propers (defined elsewhere in
# the file) on the same labels.
lnk = 'average'
palette = ['dodgerblue', 'orange', 'forestgreen',
           'lime', 'brown', 'mediumorchid',
           'darkgrey', 'teal', 'y',
           'blue', 'red', 'hotpink', 'black',
           'magenta', 'aquamarine']
for num_clusters in (3, 6, 8, 10, 13, 15):
    # fcluster numbers clusters from 1; shift the labels to start at 0.
    tall = hc.fcluster(Z4, num_clusters, criterion='maxclust') - 1
    fig, ax = plt.subplots(figsize=(12, 8))
    ax.set_title("Alg. Jeràrquics: {} - Enllaç: {} - Nº de clústers={}".format(
        "Aglomeratiu", lnk, num_clusters))
    # 'maxclust' guarantees *at most* num_clusters clusters, so iterate over
    # the labels actually present instead of range(num_clusters); the
    # palette wraps around if there are more labels than colours.
    for idcluster in np.unique(tall):
        mask = tall == idcluster
        # A named colour is used directly; cmap/norm would be ignored here.
        ax.scatter(np.array(X_PCA[:, 0][mask]),
                   np.array(X_PCA[:, 1][mask]), s=15,
                   c=palette[idcluster % len(palette)],
                   label="Cluster " + str(idcluster))
    ax.legend(handletextpad=0, borderpad=0, markerscale=2)
    plt.savefig("je-{}_{}.jpg".format(lnk, num_clusters),
                format='jpg', bbox_inches='tight')
    plt.show()
    tokens_mes_propers(tall, tfidf_vect, matriu_tfidf, 15)
# 2x3 overview figure: one scatter panel per entry of llista_clusters, each
# showing the cut of the average-linkage matrix Z4 on the first two columns
# of X_PCA. Saved as "last_average.jpg".
files = 2
cols = 3
lnk = 'average'
# figsize is (width, height): "ample" is the width and "alt" the height.
ample = 20
alt = 12
llista_clusters = [3, 6, 8, 10, 13, 15]
palette = ['dodgerblue', 'orange', 'forestgreen',
           'lime', 'brown', 'mediumorchid',
           'darkgrey', 'teal', 'y',
           'blue', 'red', 'hotpink', 'black',
           'magenta', 'aquamarine']
fig, ax = plt.subplots(files, cols, figsize=(ample, alt))
# ax.flat walks the panels row by row, matching the order of llista_clusters.
for panel, num_clusters in zip(ax.flat, llista_clusters):
    tall = hc.cut_tree(Z4, n_clusters=[num_clusters]).ravel()
    for idcluster in range(num_clusters):
        mask = tall == idcluster
        # A named colour is used directly; cmap/norm would be ignored here.
        panel.scatter(np.array(X_PCA[:, 0][mask]),
                      np.array(X_PCA[:, 1][mask]), s=5,
                      c=palette[idcluster % len(palette)],
                      label="Cluster " + str(idcluster))
    panel.set_title("Alg. Jeràrquics: {}\nEnllaç: {} #clústers={}".format(
        "Aglomeratiu", lnk, num_clusters))
    panel.legend(handletextpad=0, borderpad=0, markerscale=3)
plt.tight_layout()
plt.savefig("last_average.jpg", format='jpg', bbox_inches='tight')
# 2x3 figure of cluster-size histograms for the average-linkage matrix Z4:
# one panel per entry of llista_clusters, each bar annotated with its raw
# count (black) and its share of all observations (red).
# Saved as "hist_average.jpg".
files = 2
cols = 3
lnk = 'Average'
alt = 10
ample = 22
llista_clusters = [3, 6, 8, 10, 13, 15]
fig, ax = plt.subplots(files, cols, figsize=(ample, alt))
# ax.flat walks the panels row by row, matching the order of llista_clusters.
for panel, num_clusters in zip(ax.flat, llista_clusters):
    tall = hc.cut_tree(Z4, n_clusters=[num_clusters]).ravel()
    # Unit-width bins [0,1), [1,2), ... so each bar is one cluster label.
    counts, bins, _ = panel.hist(tall, bins=np.arange(num_clusters + 1),
                                 facecolor='skyblue', edgecolor='dodgerblue')
    bin_centers = 0.5 * np.diff(bins) + bins[:-1]
    panel.set_xticks(bin_centers)
    # One label per tick (the left edge of each bin). Passing all
    # num_clusters + 1 edges, as before, mismatches the number of ticks and
    # is rejected with ValueError by current Matplotlib releases.
    panel.set_xticklabels(bins[:-1], rotation=0, color='b')
    panel.set_title("Alg. Jeràrquics: {}\nEnllaç: {} #clústers={}".format(
        "Aglomeratiu", lnk, num_clusters))
    total = counts.sum()
    for count, x in zip(counts.astype(int), bin_centers):
        # Raw count, just below the axis.
        panel.annotate(str(count), xy=(x, 0),
                       xycoords=('data', 'axes fraction'),
                       xytext=(0, -20), textcoords='offset points',
                       va='top', ha='center')
        # Percentage of all observations, one line further down.
        percent = '%0.0f%%' % (100 * float(count) / total)
        panel.annotate(percent, xy=(x, 0),
                       xycoords=('data', 'axes fraction'),
                       xytext=(0, -35), textcoords='offset points',
                       va='top', ha='center', color='r')
plt.tight_layout()
plt.savefig("hist_average.jpg", format='jpg', bbox_inches='tight')
# Build the weighted-linkage hierarchy (Z5) over X_PCA and draw its
# dendrogram truncated to the last 200 merged clusters. The figure is saved
# as "dn_weighted.jpg" and the elapsed time is printed in minutes/seconds.
lnk = "Ponderat (Weighted)"
plt.figure(figsize=(12, 8))
time_start = time.time()
Z5 = hc.weighted(X_PCA)
opcions_dendrograma = dict(
    truncate_mode='lastp',   # show only the last p merged clusters
    p=200,                   # value of p
    leaf_rotation=90.,       # rotation of the x-axis labels
    leaf_font_size=None,     # font size of the x-axis labels
    no_labels=True,
    show_contracted=True,    # only meaningful together with truncate_mode
)
dn = dendrogram(Z5, **opcions_dendrograma)
plt.title("Dendrograma Enllaç tipus {}".format(lnk))
plt.tight_layout()
plt.savefig("dn_weighted.jpg", format='jpg', bbox_inches='tight')
# Elapsed time in minutes, split into whole minutes and remaining seconds.
temps = (time.time() - time_start) / 60
minuts = int(temps) if temps > 0 else 0
segons = int((temps - int(temps)) * 60)
print("#Visualització dendrograma (PCA):", np.shape(X_PCA),
      "\n Durada: ", minuts, "minut/s ", segons, "segons.")
# For each requested number of clusters, cut the weighted-linkage matrix Z5,
# draw the clusters on the first two columns of X_PCA, save the figure as
# "je-<lnk>_<k>.jpg" and then call tokens_mes_propers (defined elsewhere in
# the file) on the same labels.
lnk = 'Weighted'
palette = ['dodgerblue', 'orange', 'forestgreen',
           'lime', 'brown', 'mediumorchid',
           'darkgrey', 'teal', 'y',
           'blue', 'red', 'hotpink', 'black',
           'magenta', 'aquamarine']
for num_clusters in (3, 6, 8, 10, 13, 15):
    # fcluster numbers clusters from 1; shift the labels to start at 0.
    tall = hc.fcluster(Z5, num_clusters, criterion='maxclust') - 1
    fig, ax = plt.subplots(figsize=(12, 8))
    ax.set_title("Alg. Jeràrquics: {} - Enllaç: {} - Nº de clústers={}".format(
        "Aglomeratiu", lnk, num_clusters))
    # 'maxclust' guarantees *at most* num_clusters clusters, so iterate over
    # the labels actually present instead of range(num_clusters); the
    # palette wraps around if there are more labels than colours.
    for idcluster in np.unique(tall):
        mask = tall == idcluster
        # A named colour is used directly; cmap/norm would be ignored here.
        ax.scatter(np.array(X_PCA[:, 0][mask]),
                   np.array(X_PCA[:, 1][mask]), s=15,
                   c=palette[idcluster % len(palette)],
                   label="Cluster " + str(idcluster))
    ax.legend(handletextpad=0, borderpad=0, markerscale=2)
    plt.savefig("je-{}_{}.jpg".format(lnk, num_clusters),
                format='jpg', bbox_inches='tight')
    plt.show()
    tokens_mes_propers(tall, tfidf_vect, matriu_tfidf, 15)
# 2x3 overview figure: one scatter panel per entry of llista_clusters, each
# showing the cut of the weighted-linkage matrix Z5 on the first two columns
# of X_PCA. Saved as "last_weighted.jpg".
files = 2
cols = 3
lnk = 'weighted'
# figsize is (width, height): "ample" is the width and "alt" the height.
ample = 20
alt = 12
llista_clusters = [3, 6, 8, 10, 13, 15]
palette = ['dodgerblue', 'orange', 'forestgreen',
           'lime', 'brown', 'mediumorchid',
           'darkgrey', 'teal', 'y',
           'blue', 'red', 'hotpink', 'black',
           'magenta', 'aquamarine']
fig, ax = plt.subplots(files, cols, figsize=(ample, alt))
# ax.flat walks the panels row by row, matching the order of llista_clusters.
for panel, num_clusters in zip(ax.flat, llista_clusters):
    tall = hc.cut_tree(Z5, n_clusters=[num_clusters]).ravel()
    for idcluster in range(num_clusters):
        mask = tall == idcluster
        # A named colour is used directly; cmap/norm would be ignored here.
        panel.scatter(np.array(X_PCA[:, 0][mask]),
                      np.array(X_PCA[:, 1][mask]), s=5,
                      c=palette[idcluster % len(palette)],
                      label="Cluster " + str(idcluster))
    panel.set_title("Alg. Jeràrquics: {}\nEnllaç: {} #clústers={}".format(
        "Aglomeratiu", lnk, num_clusters))
    panel.legend(handletextpad=0, borderpad=0, markerscale=3)
plt.tight_layout()
plt.savefig("last_weighted.jpg", format='jpg', bbox_inches='tight')
# 2x3 figure of cluster-size histograms for the weighted-linkage matrix Z5:
# one panel per entry of llista_clusters, each bar annotated with its raw
# count (black) and its share of all observations (red).
# Saved as "hist_weighted.jpg".
files = 2
cols = 3
lnk = 'Weighted'
alt = 10
ample = 22
llista_clusters = [3, 6, 8, 10, 13, 15]
fig, ax = plt.subplots(files, cols, figsize=(ample, alt))
# ax.flat walks the panels row by row, matching the order of llista_clusters.
for panel, num_clusters in zip(ax.flat, llista_clusters):
    tall = hc.cut_tree(Z5, n_clusters=[num_clusters]).ravel()
    # Unit-width bins [0,1), [1,2), ... so each bar is one cluster label.
    counts, bins, _ = panel.hist(tall, bins=np.arange(num_clusters + 1),
                                 facecolor='skyblue', edgecolor='dodgerblue')
    bin_centers = 0.5 * np.diff(bins) + bins[:-1]
    panel.set_xticks(bin_centers)
    # One label per tick (the left edge of each bin). Passing all
    # num_clusters + 1 edges, as before, mismatches the number of ticks and
    # is rejected with ValueError by current Matplotlib releases.
    panel.set_xticklabels(bins[:-1], rotation=0, color='b')
    panel.set_title("Alg. Jeràrquics: {}\nEnllaç: {} #clústers={}".format(
        "Aglomeratiu", lnk, num_clusters))
    total = counts.sum()
    for count, x in zip(counts.astype(int), bin_centers):
        # Raw count, just below the axis.
        panel.annotate(str(count), xy=(x, 0),
                       xycoords=('data', 'axes fraction'),
                       xytext=(0, -20), textcoords='offset points',
                       va='top', ha='center')
        # Percentage of all observations, one line further down.
        percent = '%0.0f%%' % (100 * float(count) / total)
        panel.annotate(percent, xy=(x, 0),
                       xycoords=('data', 'axes fraction'),
                       xytext=(0, -35), textcoords='offset points',
                       va='top', ha='center', color='r')
plt.tight_layout()
plt.savefig("hist_weighted.jpg", format='jpg', bbox_inches='tight')
# Build the centroid-linkage hierarchy (Z6) over X_PCA and draw its
# dendrogram truncated to the last 200 merged clusters. The figure is saved
# as "dn_centroid.jpg" and the elapsed time is printed in minutes/seconds.
lnk = "Centroide"
plt.figure(figsize=(12, 8))
time_start = time.time()
Z6 = hc.centroid(X_PCA)
opcions_dendrograma = dict(
    truncate_mode='lastp',   # show only the last p merged clusters
    p=200,                   # value of p
    leaf_rotation=90.,       # rotation of the x-axis labels
    leaf_font_size=None,     # font size of the x-axis labels
    no_labels=True,
    show_contracted=True,    # only meaningful together with truncate_mode
)
dn = dendrogram(Z6, **opcions_dendrograma)
plt.title("Dendrograma Enllaç tipus {}".format(lnk))
plt.savefig("dn_centroid.jpg", format='jpg', bbox_inches='tight')
plt.show()
# Elapsed time in minutes, split into whole minutes and remaining seconds.
temps = (time.time() - time_start) / 60
minuts = int(temps) if temps > 0 else 0
segons = int((temps - int(temps)) * 60)
print("#Visualització dendrograma (PCA):", np.shape(X_PCA),
      "\n Durada: ", minuts, "minut/s ", segons, "segons.")
# For each requested number of clusters, cut the centroid-linkage matrix Z6,
# draw the clusters on the first two columns of X_PCA, save the figure as
# "je-<lnk>_<k>.jpg" and then call tokens_mes_propers (defined elsewhere in
# the file) on the same labels.
lnk = 'Centroide'
palette = ['dodgerblue', 'orange', 'forestgreen',
           'lime', 'brown', 'mediumorchid',
           'darkgrey', 'teal', 'y',
           'blue', 'red', 'hotpink', 'black',
           'magenta', 'aquamarine']
for num_clusters in (3, 6, 8, 10, 13, 15):
    # fcluster numbers clusters from 1; shift the labels to start at 0.
    tall = hc.fcluster(Z6, num_clusters, criterion='maxclust') - 1
    fig, ax = plt.subplots(figsize=(12, 8))
    ax.set_title("Alg. Jeràrquics: {} - Enllaç: {} - Nº de clústers={}".format(
        "Aglomeratiu", lnk, num_clusters))
    # 'maxclust' guarantees *at most* num_clusters clusters, so iterate over
    # the labels actually present instead of range(num_clusters); the
    # palette wraps around if there are more labels than colours.
    for idcluster in np.unique(tall):
        mask = tall == idcluster
        # A named colour is used directly; cmap/norm would be ignored here.
        ax.scatter(np.array(X_PCA[:, 0][mask]),
                   np.array(X_PCA[:, 1][mask]), s=15,
                   c=palette[idcluster % len(palette)],
                   label="Cluster " + str(idcluster))
    ax.legend(handletextpad=0, borderpad=0, markerscale=2)
    plt.savefig("je-{}_{}.jpg".format(lnk, num_clusters),
                format='jpg', bbox_inches='tight')
    plt.show()
    tokens_mes_propers(tall, tfidf_vect, matriu_tfidf, 15)
# 2x3 overview figure: one scatter panel per entry of llista_clusters, each
# showing the cut of the centroid-linkage matrix Z6 on the first two columns
# of X_PCA. Saved as "last_centroid.jpg".
# NOTE(review): unlike the other linkage grids this one cuts with fcluster
# rather than cut_tree -- presumably because centroid linkage can be
# non-monotonic; confirm before unifying.
files = 2
cols = 3
lnk = 'centroid'
# figsize is (width, height): "ample" is the width and "alt" the height.
ample = 20
alt = 12
llista_clusters = [3, 6, 8, 10, 13, 15]
palette = ['dodgerblue', 'orange', 'forestgreen',
           'lime', 'brown', 'mediumorchid',
           'darkgrey', 'teal', 'y',
           'blue', 'red', 'hotpink', 'black',
           'magenta', 'aquamarine']
fig, ax = plt.subplots(files, cols, figsize=(ample, alt))
# ax.flat walks the panels row by row, matching the order of llista_clusters.
for panel, num_clusters in zip(ax.flat, llista_clusters):
    # fcluster numbers clusters from 1; shift the labels to start at 0.
    tall = hc.fcluster(Z6, num_clusters, criterion='maxclust') - 1
    # 'maxclust' guarantees *at most* num_clusters clusters, so iterate over
    # the labels actually present instead of range(num_clusters).
    for idcluster in np.unique(tall):
        mask = tall == idcluster
        # A named colour is used directly; cmap/norm would be ignored here.
        panel.scatter(np.array(X_PCA[:, 0][mask]),
                      np.array(X_PCA[:, 1][mask]), s=5,
                      c=palette[idcluster % len(palette)],
                      label="Cluster " + str(idcluster))
    panel.set_title("Alg. Jeràrquics: {}\nEnllaç: {} #clústers={}".format(
        "Aglomeratiu", lnk, num_clusters))
    panel.legend(handletextpad=0, borderpad=0, markerscale=3)
plt.tight_layout()
plt.savefig("last_centroid.jpg", format='jpg', bbox_inches='tight')
# 2x3 grid of histograms with the size of every cluster for each
# centroid-linkage cut (Z6); each bar is annotated with its count and share.
files = 2
cols = 3
lnk = 'Centroide'
alt = 10
ample = 22
fig, ax = plt.subplots(files, cols, figsize=(ample, alt))
llista_clusters = [3, 6, 8, 10, 13, 15]
for panell, num_clusters in zip(ax.flat, llista_clusters):
    tall = hc.fcluster(Z6, num_clusters, criterion='maxclust') - 1  # 0-based labels
    # One unit-wide bin per cluster id: edges 0, 1, ..., num_clusters.
    counts, bins, patches = panell.hist(tall, bins=np.arange(num_clusters + 1),
                                        facecolor='skyblue', edgecolor='dodgerblue')
    bin_centers = 0.5 * np.diff(bins) + bins[:-1]
    panell.set_xticks(bin_centers)
    # There is one tick per bin, so label with the left edges only (the
    # cluster ids). Passing all num_clusters+1 edges mismatches the tick
    # count, which recent matplotlib rejects.
    panell.set_xticklabels(bins[:-1].astype(int), rotation=0, color='b')
    panell.set_title("Alg. Jeràrquics: {}\nEnllaç: {} #clústers={}".format("Aglomeratiu", lnk, num_clusters))
    total = counts.sum()
    for count, x in zip(counts.astype(int), bin_centers):
        # Raw count, just below the x axis.
        panell.annotate(str(count), xy=(x, 0), xycoords=('data', 'axes fraction'),
                        xytext=(0, -20), textcoords='offset points', va='top', ha='center')
        # Percentage of all observations, one line further down, in red.
        percent = '%0.0f%%' % (100 * float(count) / total)
        panell.annotate(percent, xy=(x, 0), xycoords=('data', 'axes fraction'),
                        xytext=(0, -35), textcoords='offset points', va='top', ha='center', c='r')
plt.tight_layout()
plt.savefig("hist_centroide.jpg", format='jpg', bbox_inches='tight')
# Build the median-linkage hierarchy on the PCA projection and draw its
# dendrogram, truncated to the last 200 merges.
lnk = "Mediana"
plt.figure(figsize=(12, 8))
time_start = time.time()
Z7 = hc.median(X_PCA)
dn = dendrogram(Z7,
                truncate_mode='lastp',  # show only the last p merged clusters
                p=200,
                leaf_rotation=90.,      # rotate the x-axis labels
                leaf_font_size=None,    # default font size for the x-axis labels
                no_labels=True,
                show_contracted=True    # mark the heights of the collapsed merges
                )
# Stop the clock before showing the figure, as the other dendrogram cells
# do, so the reported time covers linkage + drawing only.
temps = (time.time() - time_start) / 60  # minutes
plt.title("Dendrograma Enllaç tipus {}".format(lnk))
# Saved under its own name; the previous "dn_centroid.jpg" overwrote the
# figure of the centroid-linkage dendrogram.
plt.savefig("dn_mediana.jpg", format='jpg', bbox_inches='tight')
plt.show()
print("#Visualització dendrograma (PCA):", np.shape(X_PCA), "\n Durada: ", int(temps), "minut/s ",
      int((temps - int(temps)) * 60), "segons.")
# Median linkage (Z7): for every cut size, draw the flat clusters on the
# first two PCA components, save the figure and summarise each cluster's
# tokens. Replaces six copy-pasted cells that differed only in num_clusters.
# tokens_mes_propers is a helper defined elsewhere in the notebook; by its
# name it reports the 15 most representative tokens per cluster (confirm).
lnk = 'Mediana'
paleta = ['dodgerblue', 'orange', 'forestgreen',
          'lime', 'brown', 'mediumorchid',
          'darkgrey', 'teal', 'y',
          'blue', 'red', 'hotpink', 'black',
          'magenta', 'aquamarine']
for num_clusters in (3, 6, 8, 10, 13, 15):
    tall = hc.fcluster(Z7, num_clusters, criterion='maxclust') - 1  # 0-based labels
    # 'maxclust' guarantees at most num_clusters clusters (median linkage is
    # not monotonic, so fewer is possible); size the palette by the requested
    # count so colors[idcluster] always exists.
    colors = np.array(list(islice(cycle(paleta), num_clusters)))
    fig, ax = plt.subplots(figsize=(12, 8))
    ax.set_title("Alg. Jeràrquics: {} - Enllaç: {} - Nº de clústers={}".format("Aglomeratiu", lnk, num_clusters))
    for idcluster in range(num_clusters):
        membres = tall == idcluster
        # One named colour per cluster; cmap/norm would be ignored here.
        ax.scatter(np.array(X_PCA[:, 0][membres]),
                   np.array(X_PCA[:, 1][membres]), s=15,
                   color=colors[idcluster], label="Cluster " + str(idcluster))
    ax.legend(handletextpad=0, borderpad=0, markerscale=2)
    plt.savefig("je-{}_{}.jpg".format(lnk, num_clusters), format='jpg', bbox_inches='tight')
    plt.show()
    tokens_mes_propers(tall, tfidf_vect, matriu_tfidf, 15)
# 2x3 grid of scatter plots comparing the median-linkage cuts (Z7) for
# several numbers of clusters, drawn on the first two PCA components.
files = 2
cols = 3
lnk = 'Mediana'
# figsize is (width, height): the figure is 20 wide by 12 tall.
ample = 20
alt = 12
fig, ax = plt.subplots(files, cols, figsize=(ample, alt))
llista_clusters = [3, 6, 8, 10, 13, 15]
paleta = ['dodgerblue', 'orange', 'forestgreen',
          'lime', 'brown', 'mediumorchid',
          'darkgrey', 'teal', 'y',
          'blue', 'red', 'hotpink', 'black',
          'magenta', 'aquamarine']
# ax.flat walks the grid row by row, pairing each panel with one cut size.
for panell, num_clusters in zip(ax.flat, llista_clusters):
    tall = hc.fcluster(Z7, num_clusters, criterion='maxclust') - 1  # 0-based labels
    # Size the palette by the requested count: 'maxclust' may return fewer
    # clusters, and the loop below indexes up to num_clusters - 1.
    colors = np.array(list(islice(cycle(paleta), num_clusters)))
    for idcluster in range(num_clusters):
        membres = tall == idcluster
        panell.scatter(np.array(X_PCA[:, 0][membres]),
                       np.array(X_PCA[:, 1][membres]), s=5,
                       color=colors[idcluster], label="Cluster " + str(idcluster))
    panell.set_title("Alg. Jeràrquics: {}\nEnllaç: {} #clústers={}".format("Aglomeratiu", lnk, num_clusters))
    panell.legend(handletextpad=0, borderpad=0, markerscale=3)
plt.tight_layout()
plt.savefig("last_mediana.jpg", format='jpg', bbox_inches='tight')
# 2x3 grid of histograms with the size of every cluster for each
# median-linkage cut (Z7); each bar is annotated with its count and share.
files = 2
cols = 3
lnk = 'Mediana'
alt = 10
ample = 22
fig, ax = plt.subplots(files, cols, figsize=(ample, alt))
llista_clusters = [3, 6, 8, 10, 13, 15]
for panell, num_clusters in zip(ax.flat, llista_clusters):
    tall = hc.fcluster(Z7, num_clusters, criterion='maxclust') - 1  # 0-based labels
    # One unit-wide bin per cluster id: edges 0, 1, ..., num_clusters.
    counts, bins, patches = panell.hist(tall, bins=np.arange(num_clusters + 1),
                                        facecolor='skyblue', edgecolor='dodgerblue')
    bin_centers = 0.5 * np.diff(bins) + bins[:-1]
    panell.set_xticks(bin_centers)
    # One label per tick: the left bin edges are the cluster ids. Passing
    # every edge gives one label too many, which recent matplotlib rejects.
    panell.set_xticklabels(bins[:-1].astype(int), rotation=0, color='b')
    panell.set_title("Alg. Jeràrquics: {}\nEnllaç: {} #clústers={}".format("Aglomeratiu", lnk, num_clusters))
    total = counts.sum()
    for count, x in zip(counts.astype(int), bin_centers):
        # Raw count, just below the x axis.
        panell.annotate(str(count), xy=(x, 0), xycoords=('data', 'axes fraction'),
                        xytext=(0, -20), textcoords='offset points', va='top', ha='center')
        # Percentage of all observations, one line further down, in red.
        percent = '%0.0f%%' % (100 * float(count) / total)
        panell.annotate(percent, xy=(x, 0), xycoords=('data', 'axes fraction'),
                        xytext=(0, -35), textcoords='offset points', va='top', ha='center', c='r')
plt.tight_layout()
plt.savefig("hist_mediana.jpg", format='jpg', bbox_inches='tight')
# Pairwise cosine matrix of the TF-IDF rows.
# NOTE(review): despite its name, `similaritat` stores 1 - similarity, i.e. a
# dissimilarity (assuming cosine_similarity is scikit-learn's
# sklearn.metrics.pairwise.cosine_similarity -- confirm against the imports).
similaritat = 1 - cosine_similarity(matriu_tfidf)
# Bare expression: only produces output when it is the last line of a
# notebook cell; it has no effect in a plain script.
similaritat.shape
# Dimensionality reduction with PCA, applied to the matrix computed above.
# An earlier variant, kept for reference: pca = PCA(n_components=2)
Xz = similaritat
n_comp=2
print("\nCreant el model i matriu PCA - Nº Components=",n_comp)
# delayedsparse.PCA is a third-party implementation; its fit() result is
# chained into transform() on the same matrix.
pca=delayedsparse.PCA(n_components=n_comp)
X_PCA_cosine = pca.fit(Xz).transform(Xz)
print("Dimensions de les dades reduïdes amb PCA:", np.shape(X_PCA_cosine))
# Bare expression (notebook display of the projected data).
X_PCA_cosine
# Ward-linkage hierarchy built on the PCA projection of the cosine matrix
# (X_PCA_cosine); the dendrogram is truncated to the last 200 merges and the
# elapsed time of linkage + drawing is reported.
lnk = "Ward"
plt.figure(figsize=(12, 8))
time_start = time.time()
Z1_c = hc.ward(X_PCA_cosine)
opcions_dendrograma = dict(
    truncate_mode='lastp',  # show only the last p merged clusters
    p=200,
    leaf_rotation=90.,      # rotate the x-axis labels
    leaf_font_size=None,    # default font size for the x-axis labels
    no_labels=True,
    show_contracted=True,   # mark the heights of the collapsed merges
)
dn = dendrogram(Z1_c, **opcions_dendrograma)
temps = (time.time() - time_start) / 60  # minutes
titol = "Dendrograma Enllaç tipus {}. Mètrica: similaritat del cosinus.".format(lnk)
plt.title(titol)
plt.savefig("dn_ward_cosine.jpg", format='jpg', bbox_inches='tight')
plt.show()
# Split the elapsed minutes into whole minutes and remaining seconds.
minuts = max(int(temps), 0)
segons = int((temps - int(temps)) * 60)
print("#Visualització dendrograma (PCA):", np.shape(X_PCA_cosine),
      "\n Durada: ", minuts, "minut/s ", segons, "segons.")
# Ward linkage on the cosine-based PCA projection (Z1_c): scatter plots of
# the 3- and 6-cluster cuts. Replaces two copy-pasted cells.
lnk = 'ward'
paleta = ['dodgerblue', 'orange', 'forestgreen',
          'lime', 'brown', 'mediumorchid',
          'darkgrey', 'teal', 'y',
          'blue', 'red', 'hotpink', 'black',
          'magenta', 'aquamarine']
for num_clusters in (3, 6):
    tall = hc.fcluster(Z1_c, num_clusters, criterion='maxclust') - 1  # 0-based labels
    # 'maxclust' guarantees at most num_clusters clusters; size the palette
    # by the requested count so colors[idcluster] always exists.
    colors = np.array(list(islice(cycle(paleta), num_clusters)))
    fig, ax = plt.subplots(figsize=(12, 8))
    ax.set_title("Alg. Jeràrquics: {} - Enllaç: {} - mètrica:cosinus - Nº de clústers={}".format("Aglomeratiu", lnk, num_clusters))
    for idcluster in range(num_clusters):
        membres = tall == idcluster
        # One named colour per cluster; cmap/norm would be ignored here.
        ax.scatter(np.array(X_PCA_cosine[:, 0][membres]),
                   np.array(X_PCA_cosine[:, 1][membres]), s=15,
                   color=colors[idcluster], label="Cluster " + str(idcluster))
    ax.legend(loc='lower right', handletextpad=0, borderpad=0.8, markerscale=2)
    plt.savefig("je-ward_{}_cosine.jpg".format(num_clusters), format='jpg', bbox_inches='tight')
    plt.show()
# Token summary for the 6-cluster Ward/cosine cut plotted just above.
# The original cell had an empty right-hand side (`num_clusters=`), which is
# a SyntaxError for the whole file, and cut the median-linkage tree Z7
# instead of the Ward/cosine tree Z1_c used throughout this section.
# NOTE(review): 6 is taken from the preceding plot -- confirm the intended cut.
num_clusters = 6
tall = hc.fcluster(Z1_c, num_clusters, criterion='maxclust') - 1  # 0-based labels
tokens_mes_propers(tall, tfidf_vect, matriu_tfidf, 15)
# Ward linkage on the cosine-based PCA projection (Z1_c): scatter plots of
# the 8-, 10-, 13- and 15-cluster cuts. Replaces four copy-pasted cells.
lnk = 'ward'
paleta = ['dodgerblue', 'orange', 'forestgreen',
          'lime', 'brown', 'mediumorchid',
          'darkgrey', 'teal', 'y',
          'blue', 'red', 'hotpink', 'black',
          'magenta', 'aquamarine']
for num_clusters in (8, 10, 13, 15):
    tall = hc.fcluster(Z1_c, num_clusters, criterion='maxclust') - 1  # 0-based labels
    # 'maxclust' guarantees at most num_clusters clusters; size the palette
    # by the requested count so colors[idcluster] always exists.
    colors = np.array(list(islice(cycle(paleta), num_clusters)))
    fig, ax = plt.subplots(figsize=(12, 8))
    ax.set_title("Alg. Jeràrquics: {} - Enllaç: {} - mètrica:cosinus - Nº de clústers={}".format("Aglomeratiu", lnk, num_clusters))
    for idcluster in range(num_clusters):
        membres = tall == idcluster
        # One named colour per cluster; cmap/norm would be ignored here.
        ax.scatter(np.array(X_PCA_cosine[:, 0][membres]),
                   np.array(X_PCA_cosine[:, 1][membres]), s=15,
                   color=colors[idcluster], label="Cluster " + str(idcluster))
    ax.legend(loc='lower right', handletextpad=0, borderpad=0.8, markerscale=2)
    plt.savefig("je-ward_{}_cosine.jpg".format(num_clusters), format='jpg', bbox_inches='tight')
    plt.show()
# 2x3 grid of scatter plots comparing the Ward/cosine cuts (Z1_c) for
# several numbers of clusters, drawn on the cosine-based PCA projection.
files = 2
cols = 3
lnk = 'Ward (Cosinus)'
# figsize is (width, height): the figure is 20 wide by 12 tall.
ample = 20
alt = 12
fig, ax = plt.subplots(files, cols, figsize=(ample, alt))
llista_clusters = [3, 6, 8, 10, 13, 15]
paleta = ['dodgerblue', 'orange', 'forestgreen',
          'lime', 'brown', 'mediumorchid',
          'darkgrey', 'teal', 'y',
          'blue', 'red', 'hotpink', 'black',
          'magenta', 'aquamarine']
# ax.flat walks the grid row by row, pairing each panel with one cut size.
for panell, num_clusters in zip(ax.flat, llista_clusters):
    tall = hc.fcluster(Z1_c, num_clusters, criterion='maxclust') - 1  # 0-based labels
    # Size the palette by the requested count: 'maxclust' may return fewer
    # clusters, and the loop below indexes up to num_clusters - 1.
    colors = np.array(list(islice(cycle(paleta), num_clusters)))
    for idcluster in range(num_clusters):
        membres = tall == idcluster
        panell.scatter(np.array(X_PCA_cosine[:, 0][membres]),
                       np.array(X_PCA_cosine[:, 1][membres]), s=5,
                       color=colors[idcluster], label="Cluster " + str(idcluster))
    panell.set_title("Alg. Jeràrquics: {}\nEnllaç: {} #clústers={}".format("Aglomeratiu", lnk, num_clusters))
    panell.legend(loc='lower right', handletextpad=0, borderpad=0.7, markerscale=3)
plt.tight_layout()
plt.savefig("last_Ward_cosine.jpg", format='jpg', bbox_inches='tight')
# 2x2 grid of scatter plots for very fine Ward/cosine cuts (Z1_c): 200, 300,
# 500 and 1000 requested clusters. No legend is drawn: with this many
# clusters it would cover the whole panel.
files = 2
cols = 2
lnk = 'Ward (Cosinus)'
# figsize is (width, height): the figure is 20 wide by 12 tall.
ample = 20
alt = 12
fig, ax = plt.subplots(files, cols, figsize=(ample, alt))
llista_clusters = [200, 300, 500, 1000]
paleta = ['dodgerblue', 'orange', 'forestgreen',
          'lime', 'brown', 'mediumorchid',
          'darkgrey', 'teal', 'y',
          'blue', 'red', 'hotpink', 'black',
          'magenta', 'aquamarine']
# ax.flat walks the grid row by row, pairing each panel with one cut size.
for panell, num_clusters in zip(ax.flat, llista_clusters):
    tall = hc.fcluster(Z1_c, num_clusters, criterion='maxclust') - 1  # 0-based labels
    # The 15 colours repeat cyclically. The palette is sized by the requested
    # count because 'maxclust' may return fewer clusters than asked for,
    # which would leave colors[idcluster] out of range.
    colors = np.array(list(islice(cycle(paleta), num_clusters)))
    for idcluster in range(num_clusters):
        membres = tall == idcluster
        panell.scatter(np.array(X_PCA_cosine[:, 0][membres]),
                       np.array(X_PCA_cosine[:, 1][membres]), s=5,
                       color=colors[idcluster], label="Cluster " + str(idcluster))
    panell.set_title("Alg. Jeràrquics: {}\nEnllaç: {} #clústers={}".format("Aglomeratiu", lnk, num_clusters))
plt.tight_layout()
plt.savefig("last_Ward_cosine_200_300_500_1000.jpg", format='jpg', bbox_inches='tight')
# 2x3 grid of histograms with the size of every cluster for each
# Ward/cosine cut (Z1_c); each bar is annotated with its count and share.
files = 2
cols = 3
lnk = 'Ward Cosinus'
alt = 10
ample = 22
fig, ax = plt.subplots(files, cols, figsize=(ample, alt))
llista_clusters = [3, 6, 8, 10, 13, 15]
for panell, num_clusters in zip(ax.flat, llista_clusters):
    tall = hc.fcluster(Z1_c, num_clusters, criterion='maxclust') - 1  # 0-based labels
    # One unit-wide bin per cluster id: edges 0, 1, ..., num_clusters.
    counts, bins, patches = panell.hist(tall, bins=np.arange(num_clusters + 1),
                                        facecolor='skyblue', edgecolor='dodgerblue')
    bin_centers = 0.5 * np.diff(bins) + bins[:-1]
    panell.set_xticks(bin_centers)
    # One label per tick: the left bin edges are the cluster ids. Passing
    # every edge gives one label too many, which recent matplotlib rejects.
    panell.set_xticklabels(bins[:-1].astype(int), rotation=0, color='b')
    panell.set_title("Alg. Jeràrquics: {}\nEnllaç: {} #clústers={}".format("Aglomeratiu", lnk, num_clusters))
    total = counts.sum()
    for count, x in zip(counts.astype(int), bin_centers):
        # Raw count, just below the x axis.
        panell.annotate(str(count), xy=(x, 0), xycoords=('data', 'axes fraction'),
                        xytext=(0, -20), textcoords='offset points', va='top', ha='center')
        # Percentage of all observations, one line further down, in red.
        percent = '%0.0f%%' % (100 * float(count) / total)
        panell.annotate(percent, xy=(x, 0), xycoords=('data', 'axes fraction'),
                        xytext=(0, -35), textcoords='offset points', va='top', ha='center', c='r')
plt.tight_layout()
plt.savefig("hist_ward_cosine.jpg", format='jpg', bbox_inches='tight')
# Single-linkage hierarchy built on the PCA projection of the cosine matrix
# (X_PCA_cosine); the dendrogram is truncated to the last 200 merges and the
# elapsed time of linkage + drawing is reported.
lnk = "Single"
plt.figure(figsize=(12, 8))
time_start = time.time()
Z2_c = hc.single(X_PCA_cosine)
opcions_dendrograma = dict(
    truncate_mode='lastp',  # show only the last p merged clusters
    p=200,
    leaf_rotation=90.,      # rotate the x-axis labels
    leaf_font_size=None,    # default font size for the x-axis labels
    no_labels=True,
    show_contracted=True,   # mark the heights of the collapsed merges
)
dn = dendrogram(Z2_c, **opcions_dendrograma)
temps = (time.time() - time_start) / 60  # minutes
titol = "Dendrograma Enllaç tipus {}. Mètrica: similaritat del cosinus.".format(lnk)
plt.title(titol)
plt.savefig("dn_single_cosine.jpg", format='jpg', bbox_inches='tight')
plt.show()
# Split the elapsed minutes into whole minutes and remaining seconds.
minuts = max(int(temps), 0)
segons = int((temps - int(temps)) * 60)
print("#Visualització dendrograma (PCA):", np.shape(X_PCA_cosine),
      "\n Durada: ", minuts, "minut/s ", segons, "segons.")
# Single linkage on the cosine-based PCA projection (Z2_c): one scatter plot
# per cut size. Replaces six copy-pasted cells that differed only in
# num_clusters.
lnk = 'Single'
paleta = ['dodgerblue', 'orange', 'forestgreen',
          'lime', 'brown', 'mediumorchid',
          'darkgrey', 'teal', 'y',
          'blue', 'red', 'hotpink', 'black',
          'magenta', 'aquamarine']
for num_clusters in (3, 6, 8, 10, 13, 15):
    tall = hc.fcluster(Z2_c, num_clusters, criterion='maxclust') - 1  # 0-based labels
    # 'maxclust' guarantees at most num_clusters clusters; size the palette
    # by the requested count so colors[idcluster] always exists.
    colors = np.array(list(islice(cycle(paleta), num_clusters)))
    fig, ax = plt.subplots(figsize=(12, 8))
    ax.set_title("Alg. Jeràrquics: {} - Enllaç: {} - mètrica:cosinus - Nº de clústers={}".format("Aglomeratiu", lnk, num_clusters))
    for idcluster in range(num_clusters):
        membres = tall == idcluster
        # One named colour per cluster; cmap/norm would be ignored here.
        ax.scatter(np.array(X_PCA_cosine[:, 0][membres]),
                   np.array(X_PCA_cosine[:, 1][membres]), s=15,
                   color=colors[idcluster], label="Cluster " + str(idcluster))
    ax.legend(loc='lower right', handletextpad=0, borderpad=0.8, markerscale=2)
    plt.savefig("je-{}_{}_cosine.jpg".format(lnk, num_clusters), format='jpg', bbox_inches='tight')
    plt.show()
# 2x3 grid of scatter plots comparing the single-linkage/cosine cuts (Z2_c)
# for several numbers of clusters, drawn on the cosine-based PCA projection.
files = 2
cols = 3
lnk = 'Single (Cosinus)'
# figsize is (width, height): the figure is 20 wide by 12 tall.
ample = 20
alt = 12
fig, ax = plt.subplots(files, cols, figsize=(ample, alt))
llista_clusters = [3, 6, 8, 10, 13, 15]
paleta = ['dodgerblue', 'orange', 'forestgreen',
          'lime', 'brown', 'mediumorchid',
          'darkgrey', 'teal', 'y',
          'blue', 'red', 'hotpink', 'black',
          'magenta', 'aquamarine']
# ax.flat walks the grid row by row, pairing each panel with one cut size.
for panell, num_clusters in zip(ax.flat, llista_clusters):
    tall = hc.fcluster(Z2_c, num_clusters, criterion='maxclust') - 1  # 0-based labels
    # Size the palette by the requested count: 'maxclust' may return fewer
    # clusters, and the loop below indexes up to num_clusters - 1.
    colors = np.array(list(islice(cycle(paleta), num_clusters)))
    for idcluster in range(num_clusters):
        membres = tall == idcluster
        panell.scatter(np.array(X_PCA_cosine[:, 0][membres]),
                       np.array(X_PCA_cosine[:, 1][membres]), s=5,
                       color=colors[idcluster], label="Cluster " + str(idcluster))
    panell.set_title("Alg. Jeràrquics: {}\nEnllaç: {} #clústers={}".format("Aglomeratiu", lnk, num_clusters))
    panell.legend(handletextpad=0, borderpad=0, markerscale=3)
plt.tight_layout()
plt.savefig("last_Single_cosine.jpg", format='jpg', bbox_inches='tight')
# 2x2 grid of scatter plots for very fine single-linkage/cosine cuts (Z2_c):
# 200, 300, 500 and 1000 requested clusters. No legend is drawn: with this
# many clusters it would cover the whole panel.
files = 2
cols = 2
lnk = 'Single (Cosinus)'
# figsize is (width, height): the figure is 20 wide by 12 tall.
ample = 20
alt = 12
fig, ax = plt.subplots(files, cols, figsize=(ample, alt))
llista_clusters = [200, 300, 500, 1000]
paleta = ['dodgerblue', 'orange', 'forestgreen',
          'lime', 'brown', 'mediumorchid',
          'darkgrey', 'teal', 'y',
          'blue', 'red', 'hotpink', 'black',
          'magenta', 'aquamarine']
# ax.flat walks the grid row by row, pairing each panel with one cut size.
for panell, num_clusters in zip(ax.flat, llista_clusters):
    tall = hc.fcluster(Z2_c, num_clusters, criterion='maxclust') - 1  # 0-based labels
    # The 15 colours repeat cyclically. The palette is sized by the requested
    # count because 'maxclust' may return fewer clusters than asked for,
    # which would leave colors[idcluster] out of range.
    colors = np.array(list(islice(cycle(paleta), num_clusters)))
    for idcluster in range(num_clusters):
        membres = tall == idcluster
        panell.scatter(np.array(X_PCA_cosine[:, 0][membres]),
                       np.array(X_PCA_cosine[:, 1][membres]), s=5,
                       color=colors[idcluster], label="Cluster " + str(idcluster))
    panell.set_title("Alg. Jeràrquics: {}\nEnllaç: {} #clústers={}".format("Aglomeratiu", lnk, num_clusters))
plt.tight_layout()
plt.savefig("last_Single_cosine_200_300_500_1000.jpg", format='jpg', bbox_inches='tight')
# 2x3 grid of histograms with the number of points per flat cluster for the
# single-linkage hierarchy (Z2_c), one panel per requested number of clusters.
files = 2
cols = 3
lnk = 'Single Cosinus'
alt = 10
ample = 22
fig, ax = plt.subplots(files, cols, figsize=(ample, alt))
llista_clusters = [3, 6, 8, 10, 13, 15]
for eix, num_clusters in zip(ax.flat, llista_clusters):
    # Labels are shifted by one so that they start at 0.
    tall = hc.fcluster(Z2_c, num_clusters, criterion='maxclust') - 1
    # One unit-wide bin per cluster label: edges 0, 1, ..., num_clusters.
    counts, bins, patches = eix.hist(tall, bins=np.arange(num_clusters + 1),
                                     facecolor='skyblue', edgecolor='dodgerblue')
    bin_centers = 0.5 * np.diff(bins) + bins[:-1]
    eix.set_xticks(bin_centers)
    # One label per bar (the left edge, i.e. the cluster id). There is one
    # more bin edge than bars, and matplotlib rejects a label count that does
    # not match the number of fixed ticks.
    eix.set_xticklabels(bins[:-1], rotation=0, color='b')
    eix.set_title("Alg. Jeràrquics: {}\nEnllaç: {} #clústers={}".format("Aglomeratiu", lnk, num_clusters))
    total = counts.sum()
    for count, x in zip(counts.astype(int), bin_centers):
        # Raw count just below the x axis.
        eix.annotate(str(count), xy=(x, 0), xycoords=('data', 'axes fraction'),
                     xytext=(0, -20), textcoords='offset points', va='top', ha='center')
        # Share of all points, in red, below the raw count.
        percent = '%0.0f%%' % (100 * float(count) / total)
        eix.annotate(percent, xy=(x, 0), xycoords=('data', 'axes fraction'),
                     xytext=(0, -35), textcoords='offset points', va='top', ha='center', color='r')
plt.tight_layout()
plt.savefig("hist_single_cosine.jpg", format='jpg', bbox_inches='tight')
# Build the whole cluster hierarchy with complete linkage (hc.complete) on
# X_PCA_cosine and draw it as a truncated dendrogram.
lnk = "Complete"
plt.figure(figsize=(12, 8))
time_start = time.time()
Z3_c = hc.complete(X_PCA_cosine)
opcions_dendrograma = dict(
    truncate_mode='lastp',   # show only the last p merged clusters
    p=200,                   # value of p
    leaf_rotation=90.,       # rotation of the x-axis labels
    leaf_font_size=None,     # font size of the x-axis labels
    no_labels=True,
    show_contracted=True,    # used together with truncate_mode
)
dn = dendrogram(Z3_c, **opcions_dendrograma)
# Elapsed time in minutes; it covers both the linkage and the drawing.
temps = (time.time() - time_start) / 60
plt.title("Dendrograma Enllaç tipus {}. Mètrica: similaritat del cosinus.".format(lnk))
plt.savefig("dn_complete_cosine.jpg", format='jpg', bbox_inches='tight')
plt.show()
minuts = max(int(temps), 0)
segons = int((temps - int(temps)) * 60)
print("#Visualització dendrograma (PCA):", np.shape(X_PCA_cosine),
      "\n Durada: ", minuts, "minut/s ", segons, "segons.")
# Flat cuts of the complete-linkage hierarchy (Z3_c): one scatter figure per
# requested number of clusters, drawn on the first two columns of
# X_PCA_cosine and saved as je-Complete_<k>_cosine.jpg.
lnk = 'Complete'
paleta = ['dodgerblue', 'orange', 'forestgreen',
          'lime', 'brown', 'mediumorchid',
          'darkgrey', 'teal', 'y',
          'blue', 'red', 'hotpink', 'black',
          'magenta', 'aquamarine']
for num_clusters in (3, 6, 8, 10, 13, 15):
    # Labels are shifted by one so that they start at 0.
    tall = hc.fcluster(Z3_c, num_clusters, criterion='maxclust') - 1
    fig, ax = plt.subplots(figsize=(12, 8))
    ax.set_title("Alg. Jeràrquics: {} - Enllaç: {} - mètrica:cosinus - Nº de clústers={}".format("Aglomeratiu", lnk, num_clusters))
    # 'maxclust' is an upper bound: fewer clusters may be formed, so only the
    # labels actually present are drawn. The palette is reused cyclically.
    for idcluster in np.unique(tall):
        membres = tall == idcluster
        ax.scatter(np.array(X_PCA_cosine[:, 0][membres]),
                   np.array(X_PCA_cosine[:, 1][membres]), s=15,
                   color=paleta[idcluster % len(paleta)],
                   label="Cluster " + str(idcluster))
    ax.legend(loc='lower right', handletextpad=0, borderpad=0.8, markerscale=2)
    plt.savefig("je-{}_{}_cosine.jpg".format(lnk, num_clusters), format='jpg', bbox_inches='tight')
    plt.show()
# 2x3 grid of scatter plots: flat cuts of the complete-linkage hierarchy
# (Z3_c) at several cluster counts, drawn on the first two columns of
# X_PCA_cosine.
files = 2
cols = 3
lnk = 'Complet (Cosinus)'
# NOTE: figsize is (width, height); despite their names, `alt` is used as the
# width and `ample` as the height here.
alt = 20
ample = 12
fig, ax = plt.subplots(files, cols, figsize=(alt, ample))
llista_clusters = [3, 6, 8, 10, 13, 15]
paleta = ['dodgerblue', 'orange', 'forestgreen',
          'lime', 'brown', 'mediumorchid',
          'darkgrey', 'teal', 'y',
          'blue', 'red', 'hotpink', 'black',
          'magenta', 'aquamarine']
for eix, num_clusters in zip(ax.flat, llista_clusters):
    # Labels are shifted by one so that they start at 0.
    tall = hc.fcluster(Z3_c, num_clusters, criterion='maxclust') - 1
    # 'maxclust' is an upper bound: fewer clusters may be formed, so only the
    # labels actually present are drawn. The palette is reused cyclically.
    for idcluster in np.unique(tall):
        membres = tall == idcluster
        eix.scatter(np.array(X_PCA_cosine[:, 0][membres]),
                    np.array(X_PCA_cosine[:, 1][membres]), s=5,
                    color=paleta[idcluster % len(paleta)],
                    label="Cluster " + str(idcluster))
    eix.set_title("Alg. Jeràrquics: {}\nEnllaç: {} #clústers={}".format("Aglomeratiu", lnk, num_clusters))
    eix.legend(handletextpad=0, borderpad=0, markerscale=3)
plt.tight_layout()
plt.savefig("last_Complete_cosine.jpg", format='jpg', bbox_inches='tight')
# 2x2 grid of scatter plots: flat cuts of the complete-linkage hierarchy
# (Z3_c) at 200, 300, 500 and 1000 clusters, drawn on the first two columns
# of X_PCA_cosine.
files = 2
cols = 2
lnk = 'Complete (Cosinus)'
# NOTE: figsize is (width, height); despite their names, `alt` is used as the
# width and `ample` as the height here.
alt = 20
ample = 12
fig, ax = plt.subplots(files, cols, figsize=(alt, ample))
llista_clusters = [200, 300, 500, 1000]
paleta = ['dodgerblue', 'orange', 'forestgreen',
          'lime', 'brown', 'mediumorchid',
          'darkgrey', 'teal', 'y',
          'blue', 'red', 'hotpink', 'black',
          'magenta', 'aquamarine']
for eix, num_clusters in zip(ax.flat, llista_clusters):
    # Labels are shifted by one so that they start at 0.
    tall = hc.fcluster(Z3_c, num_clusters, criterion='maxclust') - 1
    # 'maxclust' is an upper bound: fewer clusters may be formed, so only the
    # labels actually present are drawn. The palette is reused cyclically.
    for idcluster in np.unique(tall):
        membres = tall == idcluster
        eix.scatter(np.array(X_PCA_cosine[:, 0][membres]),
                    np.array(X_PCA_cosine[:, 1][membres]), s=5,
                    color=paleta[idcluster % len(paleta)],
                    label="Cluster " + str(idcluster))
    eix.set_title("Alg. Jeràrquics: {}\nEnllaç: {} #clústers={}".format("Aglomeratiu", lnk, num_clusters))
    # No legend on purpose: with hundreds of clusters it would hide the plot.
plt.tight_layout()
plt.savefig("last_Complete_cosine_200_300_500_1000.jpg", format='jpg', bbox_inches='tight')
# 2x3 grid of histograms with the number of points per flat cluster for the
# complete-linkage hierarchy (Z3_c), one panel per requested number of
# clusters.
files = 2
cols = 3
lnk = 'Complete Cosinus'
alt = 10
ample = 22
fig, ax = plt.subplots(files, cols, figsize=(ample, alt))
llista_clusters = [3, 6, 8, 10, 13, 15]
for eix, num_clusters in zip(ax.flat, llista_clusters):
    # Labels are shifted by one so that they start at 0.
    tall = hc.fcluster(Z3_c, num_clusters, criterion='maxclust') - 1
    # One unit-wide bin per cluster label: edges 0, 1, ..., num_clusters.
    counts, bins, patches = eix.hist(tall, bins=np.arange(num_clusters + 1),
                                     facecolor='skyblue', edgecolor='dodgerblue')
    bin_centers = 0.5 * np.diff(bins) + bins[:-1]
    eix.set_xticks(bin_centers)
    # One label per bar (the left edge, i.e. the cluster id). There is one
    # more bin edge than bars, and matplotlib rejects a label count that does
    # not match the number of fixed ticks.
    eix.set_xticklabels(bins[:-1], rotation=0, color='b')
    eix.set_title("Alg. Jeràrquics: {}\nEnllaç: {} #clústers={}".format("Aglomeratiu", lnk, num_clusters))
    total = counts.sum()
    for count, x in zip(counts.astype(int), bin_centers):
        # Raw count just below the x axis.
        eix.annotate(str(count), xy=(x, 0), xycoords=('data', 'axes fraction'),
                     xytext=(0, -20), textcoords='offset points', va='top', ha='center')
        # Share of all points, in red, below the raw count.
        percent = '%0.0f%%' % (100 * float(count) / total)
        eix.annotate(percent, xy=(x, 0), xycoords=('data', 'axes fraction'),
                     xytext=(0, -35), textcoords='offset points', va='top', ha='center', color='r')
plt.tight_layout()
plt.savefig("hist_complete_cosine.jpg", format='jpg', bbox_inches='tight')
# Build the whole cluster hierarchy with average linkage (hc.average) on
# X_PCA_cosine and draw it as a truncated dendrogram.
lnk = "Average"
plt.figure(figsize=(12, 8))
time_start = time.time()
Z4_c = hc.average(X_PCA_cosine)
opcions_dendrograma = dict(
    truncate_mode='lastp',   # show only the last p merged clusters
    p=200,                   # value of p
    leaf_rotation=90.,       # rotation of the x-axis labels
    leaf_font_size=None,     # font size of the x-axis labels
    no_labels=True,
    show_contracted=True,    # used together with truncate_mode
)
dn = dendrogram(Z4_c, **opcions_dendrograma)
# Elapsed time in minutes; it covers both the linkage and the drawing.
temps = (time.time() - time_start) / 60
plt.title("Dendrograma Enllaç tipus {}. Mètrica: similaritat del cosinus.".format(lnk))
plt.savefig("dn_average_cosine.jpg", format='jpg', bbox_inches='tight')
plt.show()
minuts = max(int(temps), 0)
segons = int((temps - int(temps)) * 60)
print("#Visualització dendrograma (PCA):", np.shape(X_PCA_cosine),
      "\n Durada: ", minuts, "minut/s ", segons, "segons.")
# Flat cuts of the average-linkage hierarchy (Z4_c): one scatter figure per
# requested number of clusters, drawn on the first two columns of
# X_PCA_cosine and saved as je-Average_<k>_cosine.jpg.
lnk = 'Average'
paleta = ['dodgerblue', 'orange', 'forestgreen',
          'lime', 'brown', 'mediumorchid',
          'darkgrey', 'teal', 'y',
          'blue', 'red', 'hotpink', 'black',
          'magenta', 'aquamarine']
for num_clusters in (3, 6, 8, 10, 13, 15):
    # Labels are shifted by one so that they start at 0.
    tall = hc.fcluster(Z4_c, num_clusters, criterion='maxclust') - 1
    fig, ax = plt.subplots(figsize=(12, 8))
    ax.set_title("Alg. Jeràrquics: {} - Enllaç: {} - mètrica:cosinus - Nº de clústers={}".format("Aglomeratiu", lnk, num_clusters))
    # 'maxclust' is an upper bound: fewer clusters may be formed, so only the
    # labels actually present are drawn. The palette is reused cyclically.
    for idcluster in np.unique(tall):
        membres = tall == idcluster
        ax.scatter(np.array(X_PCA_cosine[:, 0][membres]),
                   np.array(X_PCA_cosine[:, 1][membres]), s=15,
                   color=paleta[idcluster % len(paleta)],
                   label="Cluster " + str(idcluster))
    ax.legend(loc='lower right', handletextpad=0, borderpad=0.8, markerscale=2)
    plt.savefig("je-{}_{}_cosine.jpg".format(lnk, num_clusters), format='jpg', bbox_inches='tight')
    plt.show()
# 2x3 grid of scatter plots: flat cuts of the average-linkage hierarchy
# (Z4_c) at several cluster counts, drawn on the first two columns of
# X_PCA_cosine.
files = 2
cols = 3
lnk = 'Average (Cosinus)'
# NOTE: figsize is (width, height); despite their names, `alt` is used as the
# width and `ample` as the height here.
alt = 20
ample = 12
fig, ax = plt.subplots(files, cols, figsize=(alt, ample))
llista_clusters = [3, 6, 8, 10, 13, 15]
paleta = ['dodgerblue', 'orange', 'forestgreen',
          'lime', 'brown', 'mediumorchid',
          'darkgrey', 'teal', 'y',
          'blue', 'red', 'hotpink', 'black',
          'magenta', 'aquamarine']
for eix, num_clusters in zip(ax.flat, llista_clusters):
    # Labels are shifted by one so that they start at 0.
    tall = hc.fcluster(Z4_c, num_clusters, criterion='maxclust') - 1
    # 'maxclust' is an upper bound: fewer clusters may be formed, so only the
    # labels actually present are drawn. The palette is reused cyclically.
    for idcluster in np.unique(tall):
        membres = tall == idcluster
        eix.scatter(np.array(X_PCA_cosine[:, 0][membres]),
                    np.array(X_PCA_cosine[:, 1][membres]), s=5,
                    color=paleta[idcluster % len(paleta)],
                    label="Cluster " + str(idcluster))
    eix.set_title("Alg. Jeràrquics: {}\nEnllaç: {} #clústers={}".format("Aglomeratiu", lnk, num_clusters))
    eix.legend(handletextpad=0, borderpad=0, markerscale=3)
plt.tight_layout()
plt.savefig("last_Average_cosine.jpg", format='jpg', bbox_inches='tight')
# 2x2 grid of scatter plots: flat cuts of the average-linkage hierarchy
# (Z4_c) at 200, 300, 500 and 1000 clusters, drawn on the first two columns
# of X_PCA_cosine.
files = 2
cols = 2
lnk = 'Average (Cosinus)'
# NOTE: figsize is (width, height); despite their names, `alt` is used as the
# width and `ample` as the height here.
alt = 20
ample = 12
fig, ax = plt.subplots(files, cols, figsize=(alt, ample))
llista_clusters = [200, 300, 500, 1000]
paleta = ['dodgerblue', 'orange', 'forestgreen',
          'lime', 'brown', 'mediumorchid',
          'darkgrey', 'teal', 'y',
          'blue', 'red', 'hotpink', 'black',
          'magenta', 'aquamarine']
for eix, num_clusters in zip(ax.flat, llista_clusters):
    # Labels are shifted by one so that they start at 0.
    tall = hc.fcluster(Z4_c, num_clusters, criterion='maxclust') - 1
    # 'maxclust' is an upper bound: fewer clusters may be formed, so only the
    # labels actually present are drawn. The palette is reused cyclically.
    for idcluster in np.unique(tall):
        membres = tall == idcluster
        eix.scatter(np.array(X_PCA_cosine[:, 0][membres]),
                    np.array(X_PCA_cosine[:, 1][membres]), s=5,
                    color=paleta[idcluster % len(paleta)],
                    label="Cluster " + str(idcluster))
    eix.set_title("Alg. Jeràrquics: {}\nEnllaç: {} #clústers={}".format("Aglomeratiu", lnk, num_clusters))
    # No legend on purpose: with hundreds of clusters it would hide the plot.
plt.tight_layout()
plt.savefig("last_Average_cosine_200_300_500_1000.jpg", format='jpg', bbox_inches='tight')
# 2x3 grid of histograms with the number of points per flat cluster for the
# average-linkage hierarchy (Z4_c), one panel per requested number of
# clusters.
files = 2
cols = 3
lnk = 'Average Cosinus'
alt = 10
ample = 22
fig, ax = plt.subplots(files, cols, figsize=(ample, alt))
llista_clusters = [3, 6, 8, 10, 13, 15]
for eix, num_clusters in zip(ax.flat, llista_clusters):
    # Labels are shifted by one so that they start at 0.
    tall = hc.fcluster(Z4_c, num_clusters, criterion='maxclust') - 1
    # One unit-wide bin per cluster label: edges 0, 1, ..., num_clusters.
    counts, bins, patches = eix.hist(tall, bins=np.arange(num_clusters + 1),
                                     facecolor='skyblue', edgecolor='dodgerblue')
    bin_centers = 0.5 * np.diff(bins) + bins[:-1]
    eix.set_xticks(bin_centers)
    # One label per bar (the left edge, i.e. the cluster id). There is one
    # more bin edge than bars, and matplotlib rejects a label count that does
    # not match the number of fixed ticks.
    eix.set_xticklabels(bins[:-1], rotation=0, color='b')
    eix.set_title("Alg. Jeràrquics: {}\nEnllaç: {} #clústers={}".format("Aglomeratiu", lnk, num_clusters))
    total = counts.sum()
    for count, x in zip(counts.astype(int), bin_centers):
        # Raw count just below the x axis.
        eix.annotate(str(count), xy=(x, 0), xycoords=('data', 'axes fraction'),
                     xytext=(0, -20), textcoords='offset points', va='top', ha='center')
        # Share of all points, in red, below the raw count.
        percent = '%0.0f%%' % (100 * float(count) / total)
        eix.annotate(percent, xy=(x, 0), xycoords=('data', 'axes fraction'),
                     xytext=(0, -35), textcoords='offset points', va='top', ha='center', color='r')
plt.tight_layout()
plt.savefig("hist_Average_cosine.jpg", format='jpg', bbox_inches='tight')
# Build the whole cluster hierarchy with centroid linkage (hc.centroid) on
# X_PCA_cosine and draw it as a truncated dendrogram.
lnk = "Centroide"
plt.figure(figsize=(12, 8))
time_start = time.time()
Z6_c = hc.centroid(X_PCA_cosine)
dn = dendrogram(Z6_c,
                truncate_mode='lastp',  # show only the last p merged clusters
                p=200,                  # value of p
                leaf_rotation=90.,      # rotation of the x-axis labels
                leaf_font_size=None,    # font size of the x-axis labels
                no_labels=True,
                show_contracted=True    # used together with truncate_mode
                )
# Elapsed time in minutes; it covers both the linkage and the drawing.
temps = (time.time() - time_start) / 60
plt.title("Dendrograma Enllaç tipus {}. Mètrica: similaritat del cosinus.".format(lnk))
# Saved under its own name so that it does not overwrite the average-linkage
# dendrogram (dn_average_cosine.jpg).
plt.savefig("dn_centroid_cosine.jpg", format='jpg', bbox_inches='tight')
plt.show()
minuts = max(int(temps), 0)
segons = int((temps - int(temps)) * 60)
print("#Visualització dendrograma (PCA):", np.shape(X_PCA_cosine),
      "\n Durada: ", minuts, "minut/s ", segons, "segons.")
# Flat cuts of the centroid-linkage hierarchy (Z6_c): one scatter figure per
# requested number of clusters, drawn on the first two columns of
# X_PCA_cosine and saved as je-Centroide_<k>_cosine.jpg.
# Looping over the cluster counts guarantees that every cut (including k=6)
# is computed with its own value and saved to its own file.
lnk = 'Centroide'
paleta = ['dodgerblue', 'orange', 'forestgreen',
          'lime', 'brown', 'mediumorchid',
          'darkgrey', 'teal', 'y',
          'blue', 'red', 'hotpink', 'black',
          'magenta', 'aquamarine']
for num_clusters in (3, 6, 8, 10, 13, 15):
    # Labels are shifted by one so that they start at 0.
    tall = hc.fcluster(Z6_c, num_clusters, criterion='maxclust') - 1
    fig, ax = plt.subplots(figsize=(12, 8))
    ax.set_title("Alg. Jeràrquics: {} - Enllaç: {} - mètrica:cosinus - Nº de clústers={}".format("Aglomeratiu", lnk, num_clusters))
    # 'maxclust' is an upper bound: fewer clusters may be formed, so only the
    # labels actually present are drawn. The palette is reused cyclically.
    for idcluster in np.unique(tall):
        membres = tall == idcluster
        ax.scatter(np.array(X_PCA_cosine[:, 0][membres]),
                   np.array(X_PCA_cosine[:, 1][membres]), s=15,
                   color=paleta[idcluster % len(paleta)],
                   label="Cluster " + str(idcluster))
    ax.legend(loc='lower right', handletextpad=0, borderpad=0.8, markerscale=2)
    plt.savefig("je-{}_{}_cosine.jpg".format(lnk, num_clusters), format='jpg', bbox_inches='tight')
    plt.show()
# 2x3 grid of scatter plots: flat cuts of the centroid-linkage hierarchy
# (Z6_c) at several cluster counts, drawn on the first two columns of
# X_PCA_cosine.
files = 2
cols = 3
lnk = 'Centroide (Cosinus)'
# NOTE: figsize is (width, height); despite their names, `alt` is used as the
# width and `ample` as the height here.
alt = 20
ample = 12
fig, ax = plt.subplots(files, cols, figsize=(alt, ample))
llista_clusters = [3, 6, 8, 10, 13, 15]
paleta = ['dodgerblue', 'orange', 'forestgreen',
          'lime', 'brown', 'mediumorchid',
          'darkgrey', 'teal', 'y',
          'blue', 'red', 'hotpink', 'black',
          'magenta', 'aquamarine']
for eix, num_clusters in zip(ax.flat, llista_clusters):
    # Labels are shifted by one so that they start at 0.
    tall = hc.fcluster(Z6_c, num_clusters, criterion='maxclust') - 1
    # 'maxclust' is an upper bound: fewer clusters may be formed, so only the
    # labels actually present are drawn. The palette is reused cyclically.
    for idcluster in np.unique(tall):
        membres = tall == idcluster
        eix.scatter(np.array(X_PCA_cosine[:, 0][membres]),
                    np.array(X_PCA_cosine[:, 1][membres]), s=5,
                    color=paleta[idcluster % len(paleta)],
                    label="Cluster " + str(idcluster))
    eix.set_title("Alg. Jeràrquics: {}\nEnllaç: {} #clústers={}".format("Aglomeratiu", lnk, num_clusters))
    eix.legend(handletextpad=0, borderpad=0, markerscale=3)
plt.tight_layout()
plt.savefig("last_Centroide_cosine.jpg", format='jpg', bbox_inches='tight')
# 2x2 grid of scatter plots: flat cuts of the centroid-linkage hierarchy
# (Z6_c) at 200, 300, 500 and 1000 clusters, drawn on the first two columns
# of X_PCA_cosine.
files = 2
cols = 2
lnk = 'Centroide (Cosinus)'
# NOTE: figsize is (width, height); despite their names, `alt` is used as the
# width and `ample` as the height here.
alt = 20
ample = 12
fig, ax = plt.subplots(files, cols, figsize=(alt, ample))
llista_clusters = [200, 300, 500, 1000]
paleta = ['dodgerblue', 'orange', 'forestgreen',
          'lime', 'brown', 'mediumorchid',
          'darkgrey', 'teal', 'y',
          'blue', 'red', 'hotpink', 'black',
          'magenta', 'aquamarine']
for eix, num_clusters in zip(ax.flat, llista_clusters):
    # Labels are shifted by one so that they start at 0.
    tall = hc.fcluster(Z6_c, num_clusters, criterion='maxclust') - 1
    # 'maxclust' is an upper bound: fewer clusters may be formed, so only the
    # labels actually present are drawn. The palette is reused cyclically.
    for idcluster in np.unique(tall):
        membres = tall == idcluster
        eix.scatter(np.array(X_PCA_cosine[:, 0][membres]),
                    np.array(X_PCA_cosine[:, 1][membres]), s=5,
                    color=paleta[idcluster % len(paleta)],
                    label="Cluster " + str(idcluster))
    eix.set_title("Alg. Jeràrquics: {}\nEnllaç: {} #clústers={}".format("Aglomeratiu", lnk, num_clusters))
    # No legend on purpose: with hundreds of clusters it would hide the plot.
plt.tight_layout()
plt.savefig("last_Centroide_cosine_200_300_500_1000.jpg", format='jpg', bbox_inches='tight')
# 2x3 grid of histograms with the number of points per flat cluster for the
# centroid-linkage hierarchy (Z6_c), one panel per requested number of
# clusters.
files = 2
cols = 3
lnk = 'Centroide Cosinus'
alt = 10
ample = 22
fig, ax = plt.subplots(files, cols, figsize=(ample, alt))
llista_clusters = [3, 6, 8, 10, 13, 15]
for eix, num_clusters in zip(ax.flat, llista_clusters):
    # Labels are shifted by one so that they start at 0.
    tall = hc.fcluster(Z6_c, num_clusters, criterion='maxclust') - 1
    # One unit-wide bin per cluster label: edges 0, 1, ..., num_clusters.
    counts, bins, patches = eix.hist(tall, bins=np.arange(num_clusters + 1),
                                     facecolor='skyblue', edgecolor='dodgerblue')
    bin_centers = 0.5 * np.diff(bins) + bins[:-1]
    eix.set_xticks(bin_centers)
    # One label per bar (the left edge, i.e. the cluster id). There is one
    # more bin edge than bars, and matplotlib rejects a label count that does
    # not match the number of fixed ticks.
    eix.set_xticklabels(bins[:-1], rotation=0, color='b')
    eix.set_title("Alg. Jeràrquics: {}\nEnllaç: {} #clústers={}".format("Aglomeratiu", lnk, num_clusters))
    total = counts.sum()
    for count, x in zip(counts.astype(int), bin_centers):
        # Raw count just below the x axis.
        eix.annotate(str(count), xy=(x, 0), xycoords=('data', 'axes fraction'),
                     xytext=(0, -20), textcoords='offset points', va='top', ha='center')
        # Share of all points, in red, below the raw count.
        percent = '%0.0f%%' % (100 * float(count) / total)
        eix.annotate(percent, xy=(x, 0), xycoords=('data', 'axes fraction'),
                     xytext=(0, -35), textcoords='offset points', va='top', ha='center', color='r')
plt.tight_layout()
plt.savefig("hist_Centroide_cosine.jpg", format='jpg', bbox_inches='tight')
# 2x3 grid of scatter plots for the average-linkage hierarchy (Z4_c), then
# tokens_mes_propers is called on the last cut computed (15 clusters).
files = 2
cols = 3
lnk = 'Average (Cosinus)'
# NOTE: figsize is (width, height); despite their names, `alt` is used as the
# width and `ample` as the height here.
alt = 20
ample = 12
fig, ax = plt.subplots(files, cols, figsize=(alt, ample))
llista_clusters = [3, 6, 8, 10, 13, 15]
paleta = ['dodgerblue', 'orange', 'forestgreen',
          'lime', 'brown', 'mediumorchid',
          'darkgrey', 'teal', 'y',
          'blue', 'red', 'hotpink', 'black',
          'magenta', 'aquamarine']
for eix, num_clusters in zip(ax.flat, llista_clusters):
    # Labels are shifted by one so that they start at 0.
    tall = hc.fcluster(Z4_c, num_clusters, criterion='maxclust') - 1
    # 'maxclust' is an upper bound: fewer clusters may be formed, so only the
    # labels actually present are drawn. The palette is reused cyclically.
    for idcluster in np.unique(tall):
        membres = tall == idcluster
        eix.scatter(np.array(X_PCA_cosine[:, 0][membres]),
                    np.array(X_PCA_cosine[:, 1][membres]), s=5,
                    color=paleta[idcluster % len(paleta)],
                    label="Cluster " + str(idcluster))
    eix.set_title("Alg. Jeràrquics: {}\nEnllaç: {} #clústers={}".format("Aglomeratiu", lnk, num_clusters))
    eix.legend(handletextpad=0, borderpad=0, markerscale=3)
plt.tight_layout()
# `tall` still holds the labels of the last cut in llista_clusters.
# NOTE(review): the meaning of the literal 15 is defined by tokens_mes_propers,
# which is outside this cell - confirm before changing it.
tokens_mes_propers(tall, tfidf_vect, matriu_tfidf, 15)